All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHSET RESEND sched_ext/for-7.2] sched_ext: cmask improvements
@ 2026-05-17 18:36 Tejun Heo
  2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 18:36 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

Hello,

Resend with the correct cover letter. The earlier posting
(20260517181022.1184056-1-tj@kernel.org) went out under a stale cover
from an unrelated draft; the three patches themselves were the right
ones and are unchanged in this resend. Apologies for the noise.

Three patches for cmask: tidy active-range bookkeeping and add the
mask-on-mask op helpers the sub-sched series will use.

This is not backward-compatible with the current scx_cmask layout/API. That
is acceptable because cmask landed in for-7.2 and hasn't been released yet;
scx_qmap is the only user.

 0001 - sched_ext: Rename scx_cmask.nr_bits to nr_cids
 0002 - sched_ext: Track bits[] storage size in struct scx_cmask
 0003 - sched_ext: Add cmask mask ops

Based on sched_ext/for-7.2 (c9017d335aab).

Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git cmask-prep

 kernel/sched/ext_cid.c                | 307 +++++++++++++++++++++++++++++++++-
 kernel/sched/ext_cid.h                |  71 +++++++-
 kernel/sched/ext_types.h              |  64 +++++--
 tools/sched_ext/include/scx/cid.bpf.h | 117 +++++++++----
 4 files changed, 506 insertions(+), 53 deletions(-)

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids
  2026-05-17 18:36 [PATCHSET RESEND sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
@ 2026-05-17 18:36 ` Tejun Heo
  2026-05-17 18:43   ` sashiko-bot
  2026-05-17 19:02   ` [PATCH v2 " Tejun Heo
  2026-05-17 18:36 ` [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
  2026-05-17 18:36 ` [PATCH 3/3] sched_ext: Add cmask mask ops Tejun Heo
  2 siblings, 2 replies; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 18:36 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

struct scx_cmask is a base-windowed bitmap over cid space. Each bit
represents one cid, so the count of active bits is the count of cids. The
sibling struct scx_cid_shard already uses nr_cids. Rename as a prep so the
following patches that grow the cmask API can use the consistent name.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.h                |  8 ++--
 kernel/sched/ext_types.h              | 12 ++---
 tools/sched_ext/include/scx/cid.bpf.h | 64 +++++++++++++--------------
 3 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index f41d48afb7d1..e1c44a180bb1 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -138,7 +138,7 @@ static inline bool scx_is_cid_type(void)
 
 static inline bool __scx_cmask_contains(const struct scx_cmask *m, u32 cid)
 {
-	return likely(cid >= m->base && cid < m->base + m->nr_bits);
+	return likely(cid >= m->base && cid < m->base + m->nr_cids);
 }
 
 /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
@@ -147,11 +147,11 @@ static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_bits)
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
 {
 	m->base = base;
-	m->nr_bits = nr_bits;
-	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_bits) * sizeof(u64));
+	m->nr_cids = nr_cids;
+	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
 }
 
 static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index ebb8cdf90612..c6c4e3db7311 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -67,10 +67,10 @@ struct scx_cid_topo {
  * cmask: variable-length, base-windowed bitmap over cid space
  * -----------------------------------------------------------
  *
- * A cmask covers the cid range [base, base + nr_bits). bits[] is aligned to the
+ * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
  * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
  * first (base & 63) bits of bits[0] are head padding and any tail past base +
- * nr_bits is tail padding. Both must stay zero for the lifetime of the mask;
+ * nr_cids is tail padding. Both must stay zero for the lifetime of the mask;
  * all mutating helpers preserve that invariant.
  *
  * Grid alignment means two cmasks always address bits[] against the same global
@@ -82,21 +82,21 @@ struct scx_cid_topo {
  */
 struct scx_cmask {
 	u32 base;
-	u32 nr_bits;
+	u32 nr_cids;
 	DECLARE_FLEX_ARRAY(u64, bits);
 };
 
 /*
- * Number of u64 words of bits[] storage that covers @nr_bits regardless of base
+ * Number of u64 words of bits[] storage that covers @nr_cids regardless of base
  * alignment. The +1 absorbs up to 63 bits of head padding when base is not
  * 64-aligned - always allocating one extra word beats branching on base or
  * splitting the compute.
  */
-#define SCX_CMASK_NR_WORDS(nr_bits)	(((nr_bits) + 63) / 64 + 1)
+#define SCX_CMASK_NR_WORDS(nr_cids)	(((nr_cids) + 63) / 64 + 1)
 
 /*
  * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
- * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_bits.
+ * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_cids.
  */
 #define SCX_CMASK_DEFINE(name, cap_bits)	\
 	DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 7a867e435670..b9dcc14870d3 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -32,11 +32,11 @@
 #define CMASK_MAX_WORDS 129
 #endif
 
-#define CMASK_NR_WORDS(nr_bits)		(((nr_bits) + 63) / 64 + 1)
+#define CMASK_NR_WORDS(nr_cids)		(((nr_cids) + 63) / 64 + 1)
 
 static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
 {
-	return cid >= m->base && cid < m->base + m->nr_bits;
+	return cid >= m->base && cid < m->base + m->nr_cids;
 }
 
 static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
@@ -44,12 +44,12 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
 	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_bits)
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
 {
-	u32 nr_words = CMASK_NR_WORDS(nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
 
 	m->base = base;
-	m->nr_bits = nr_bits;
+	m->nr_cids = nr_cids;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -206,7 +206,7 @@ static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m,
 
 static __always_inline void cmask_zero(struct scx_cmask __arena *m)
 {
-	u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -250,8 +250,8 @@ static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
 static __always_inline void cmask_op(struct scx_cmask __arena *dst,
 				     const struct scx_cmask __arena *src, int op)
 {
-	u32 d_end = dst->base + dst->nr_bits;
-	u32 s_end = src->base + src->nr_bits;
+	u32 d_end = dst->base + dst->nr_cids;
+	u32 s_end = src->base + src->nr_cids;
 	u32 lo = dst->base > src->base ? dst->base : src->base;
 	u32 hi = d_end < s_end ? d_end : s_end;
 	u32 d_base = dst->base / 64;
@@ -286,8 +286,8 @@ static __always_inline void cmask_op(struct scx_cmask __arena *dst,
 
 /*
  * cmask_and/or/copy only modify @dst bits that lie in the intersection of
- * [@dst->base, @dst->base + @dst->nr_bits) and [@src->base,
- * @src->base + @src->nr_bits). Bits in @dst outside that window
+ * [@dst->base, @dst->base + @dst->nr_cids) and [@src->base,
+ * @src->base + @src->nr_cids). Bits in @dst outside that window
  * keep their prior values - in particular, cmask_copy() does NOT zero @dst
  * bits that lie outside @src's range.
  */
@@ -325,9 +325,9 @@ static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
 {
 	u32 nr_words, i;
 
-	if (a->base != b->base || a->nr_bits != b->nr_bits)
+	if (a->base != b->base || a->nr_cids != b->nr_cids)
 		return false;
-	nr_words = CMASK_NR_WORDS(a->nr_bits);
+	nr_words = CMASK_NR_WORDS(a->nr_cids);
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -345,8 +345,8 @@ static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
 static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
 					 const struct scx_cmask __arena *b)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 a_wbase = a->base / 64;
 	u32 b_wbase = b->base / 64;
 	u32 nr_words, i;
@@ -355,7 +355,7 @@ static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
 	if (a->base < b->base || a_end > b_end)
 		return false;
 
-	nr_words = CMASK_NR_WORDS(a->nr_bits);
+	nr_words = CMASK_NR_WORDS(a->nr_cids);
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		u32 wi_b;
 
@@ -373,13 +373,13 @@ static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
  * @m: cmask to search
  * @cid: starting cid (clamped to @m->base if below)
  *
- * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
- * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or
+ * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the
  * termination condition used by cmask_for_each()).
  */
 static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
 {
-	u32 end = m->base + m->nr_bits;
+	u32 end = m->base + m->nr_cids;
 	u32 base = m->base / 64;
 	u32 last_wi = (end - 1) / 64 - base;
 	u32 start_wi, start_bit, i;
@@ -421,17 +421,17 @@ static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
 
 #define cmask_for_each(cid, m)							\
 	for ((cid) = cmask_first_set(m);					\
-	     (cid) < (m)->base + (m)->nr_bits;					\
+	     (cid) < (m)->base + (m)->nr_cids;					\
 	     (cid) = cmask_next_set((m), (cid) + 1))
 
 /*
- * Population count over [base, base + nr_bits). Padding bits in the head/tail
+ * Population count over [base, base + nr_cids). Padding bits in the head/tail
  * words are guaranteed zero by the mutating helpers, so a flat popcount over
  * all words is correct.
  */
 static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
 {
-	u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
 	u32 count = 0;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
@@ -449,8 +449,8 @@ static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
 static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
 					     const struct scx_cmask __arena *b)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 lo = a->base > b->base ? a->base : b->base;
 	u32 hi = a_end < b_end ? a_end : b_end;
 	u32 a_base = a->base / 64;
@@ -489,7 +489,7 @@ static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
 
 /*
  * Find the next cid set in both @a and @b at or after @start, bounded by the
- * intersection of the two ranges. Return a->base + a->nr_bits if none found.
+ * intersection of the two ranges. Return a->base + a->nr_cids if none found.
  *
  * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
  * scan without wrap call this directly.
@@ -498,8 +498,8 @@ static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
 					      const struct scx_cmask __arena *b,
 					      u32 start)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 a_wbase = a->base / 64;
 	u32 b_wbase = b->base / 64;
 	u32 lo = a->base > b->base ? a->base : b->base;
@@ -541,15 +541,15 @@ static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
 
 /*
  * Find the next set cid in @m at or after @start, wrapping to @m->base if no
- * set bit is found in [start, m->base + m->nr_bits). Return m->base +
- * m->nr_bits if @m is empty.
+ * set bit is found in [start, m->base + m->nr_cids). Return m->base +
+ * m->nr_cids if @m is empty.
  *
  * Callers do round-robin distribution by passing (last_cid + 1) as @start.
  */
 static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
 					       u32 start)
 {
-	u32 end = m->base + m->nr_bits;
+	u32 end = m->base + m->nr_cids;
 	u32 found;
 
 	found = cmask_next_set(m, start);
@@ -562,7 +562,7 @@ static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m
 
 /*
  * Find the next cid set in both @a and @b at or after @start, wrapping to
- * @a->base if none found in the forward half. Return a->base + a->nr_bits
+ * @a->base if none found in the forward half. Return a->base + a->nr_cids
  * if the intersection is empty.
  *
  * Callers do round-robin distribution by passing (last_cid + 1) as @start.
@@ -571,7 +571,7 @@ static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __aren
 						   const struct scx_cmask __arena *b,
 						   u32 start)
 {
-	u32 a_end = a->base + a->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
 	u32 found;
 
 	found = cmask_next_and_set(a, b, start);
@@ -585,7 +585,7 @@ static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __aren
 /**
  * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
  * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base +
- *     @m->nr_bits) are updated - cpus mapping to cids outside that range
+ *     @m->nr_cids) are updated - cpus mapping to cids outside that range
  *     are ignored.
  * @cpumask: kernel cpumask to translate
  *
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-17 18:36 [PATCHSET RESEND sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
  2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
@ 2026-05-17 18:36 ` Tejun Heo
  2026-05-17 19:14   ` sashiko-bot
  2026-05-17 19:29   ` [PATCH v2 " Tejun Heo
  2026-05-17 18:36 ` [PATCH 3/3] sched_ext: Add cmask mask ops Tejun Heo
  2 siblings, 2 replies; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 18:36 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

scx_cmask carries @base and @nr_cids but not the bits[] allocation size, so
helpers reshaping the active range have no way to check it fits and later
kfuncs taking caller-provided storage can't validate it.

Add @alloc_words (u64 word count) annotated with __counted_by, and split the
bit-range API into three groups of helpers:

- SCX_CMASK_DEFINE() / __SCX_CMASK_DEFINE() define an on-stack cmask, the
  latter taking an explicit capacity for oversized storage.
  SCX_CMASK_DEFINE_SHARD() is a thin wrapper that always reserves
  SCX_CID_SHARD_MAX_CPUS bits of storage.

- scx_cmask_init() / __scx_cmask_init() initialize a cmask, with the same
  tight-vs-explicit split.

- scx_cmask_reframe() reshapes the active range without resizing storage.

The BPF mirror (cmask_init / __cmask_init / cmask_reframe) gets the same
shape.

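Add scx_cmask_clear() and scx_cmask_fill() to zero and set the
active-range bits respectively. scx_cpumask_to_cmask() now uses
scx_cmask_clear() instead of scx_cmask_init(), which would otherwise
re-write @alloc_words on every call. As a result, it no longer frames @dst
to [0, num_possible_cpus()); @dst must already be initialized and cids
outside its active range are ignored.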

A later patch uses @alloc_words in scx_cmask_ref_shard() to refuse output
storage that can't hold the requested shard.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.c                | 48 ++++++++++++++++++++--
 kernel/sched/ext_cid.h                | 57 ++++++++++++++++++++++++++-
 kernel/sched/ext_types.h              | 56 +++++++++++++++++++++-----
 tools/sched_ext/include/scx/cid.bpf.h | 57 +++++++++++++++++++++++++--
 4 files changed, 202 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index bdd8ef8eae3d..f50319c5c65e 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -222,19 +222,61 @@ s32 scx_cid_init(struct scx_sched *sch)
 	return 0;
 }
 
+/**
+ * scx_cmask_clear - Zero every bit in @m's active range
+ * @m: cmask to clear
+ *
+ * Storage past the active range is left as is.
+ */
+void scx_cmask_clear(struct scx_cmask *m)
+{
+	u32 nr_words;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0, nr_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_fill - Set every bit in @m's active range
+ * @m: cmask to fill
+ *
+ * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
+ */
+void scx_cmask_fill(struct scx_cmask *m)
+{
+	u32 nr_words, head_bits, tail_bits;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0xff, nr_words * sizeof(u64));
+
+	/* clear word-0 bits below base */
+	head_bits = m->base & 63;
+	if (head_bits)
+		m->bits[0] &= ~((1ULL << head_bits) - 1);
+
+	/* clear last-word bits at or past base + nr_cids */
+	tail_bits = (m->base + m->nr_cids) & 63;
+	if (tail_bits)
+		m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
+}
+
 /**
  * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
  * @src: source cpumask
  * @dst: cmask to write
  *
- * Initialize @dst to cover the full cid space [0, num_possible_cpus()) and
- * set the bit for each cid whose cpu is in @src.
+ * Clear @dst's active range and set the bit for each cid whose cpu is in
+ * @src and lies within that range. Out-of-range cids are silently ignored.
  */
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
 {
 	s32 cpu;
 
-	scx_cmask_init(dst, 0, num_possible_cpus());
+	scx_cmask_clear(dst);
 	for_each_cpu(cpu, src) {
 		s32 cid = __scx_cpu_to_cid(cpu);
 
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index e1c44a180bb1..223ed0e857ec 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -51,6 +51,8 @@ extern s16 *scx_cpu_to_cid_tbl;
 extern struct scx_cid_topo *scx_cid_topo;
 extern struct btf_id_set8 scx_kfunc_ids_init;
 
+void scx_cmask_clear(struct scx_cmask *m);
+void scx_cmask_fill(struct scx_cmask *m);
 s32 scx_cid_init(struct scx_sched *sch);
 int scx_cid_kfunc_init(void);
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
@@ -147,11 +149,64 @@ static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
 }
 
+/**
+ * __scx_cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
+				    u32 alloc_cids)
+{
+	if (WARN_ON_ONCE(alloc_cids < nr_cids))
+		nr_cids = alloc_cids;
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
+	memset(m->bits, 0, m->alloc_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
 static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
 {
+	__scx_cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * scx_cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
+		return;
+
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+
 	m->base = base;
 	m->nr_cids = nr_cids;
-	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
 }
 
 static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index c6c4e3db7311..0c318a359849 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -69,9 +69,10 @@ struct scx_cid_topo {
  *
  * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
  * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
- * first (base & 63) bits of bits[0] are head padding and any tail past base +
- * nr_cids is tail padding. Both must stay zero for the lifetime of the mask;
- * all mutating helpers preserve that invariant.
+ * first (base & 63) bits of bits[0] are head padding and the trailing bits of
+ * the last active word past base + nr_cids are tail padding. Both stay zero;
+ * all mutating helpers preserve that. Words past the last active word are not
+ * read by any helper and have no constraint.
  *
  * Grid alignment means two cmasks always address bits[] against the same global
  * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
@@ -83,7 +84,8 @@ struct scx_cid_topo {
 struct scx_cmask {
 	u32 base;
 	u32 nr_cids;
-	DECLARE_FLEX_ARRAY(u64, bits);
+	u32 alloc_words;
+	u64 bits[] __counted_by(alloc_words);
 };
 
 /*
@@ -94,11 +96,47 @@ struct scx_cmask {
  */
 #define SCX_CMASK_NR_WORDS(nr_cids)	(((nr_cids) + 63) / 64 + 1)
 
-/*
- * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
- * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_cids.
+/**
+ * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length
+ * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS
+ *
+ * @NAME aliases zero-initialized storage with the active range set to
+ * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to
+ * @ALLOC_CIDS.
+ */
+#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS)			\
+	_DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
+		     = { .base = (BASE),					\
+			 .nr_cids = (NR_CIDS),					\
+			 .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })
+
+/**
+ * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, also storage capacity
+ *
+ * @NAME aliases zero-initialized storage with the active range and storage
+ * both [BASE, BASE + NR_CIDS).
+ */
+#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS)					\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS)
+
+/**
+ * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS
+ *
+ * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by
+ * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the
+ * cmask claiming more bits than storage holds and subsequent cmask
+ * operations will overrun.
  */
-#define SCX_CMASK_DEFINE(name, cap_bits)	\
-	DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS)				\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS)
 
 #endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index b9dcc14870d3..211f4077f43a 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -44,20 +44,71 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
 	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+/**
+ * __cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
+					 u32 nr_cids, u32 alloc_cids)
 {
-	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
+	u32 alloc_words = CMASK_NR_WORDS(alloc_cids), i;
 
 	m->base = base;
 	m->nr_cids = nr_cids;
+	m->alloc_words = alloc_words;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
-		if (i >= nr_words)
+		if (i >= alloc_words)
 			break;
 		m->bits[i] = 0;
 	}
 }
 
+/**
+ * cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	__cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) {
+		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
+			      nr_cids, m->alloc_words);
+		return;
+	}
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
 static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
 {
 	if (!__cmask_contains(m, cid))
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] sched_ext: Add cmask mask ops
  2026-05-17 18:36 [PATCHSET RESEND sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
  2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
  2026-05-17 18:36 ` [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
@ 2026-05-17 18:36 ` Tejun Heo
  2026-05-18 23:58   ` [PATCH v2 " Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 18:36 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

Sub-sched cap code and other upcoming consumers need bulk cmask ops, both
mutating (and/or/copy/andnot) and predicate (subset/intersects).

cmask_walk_op2() walks the intersection of two ranges word by word;
cmask_walk_op1() walks one range. Both are __always_inline and dispatched on
a compile-time-constant op enum, so each public entry collapses to a
specialized loop with the inner switch reduced to one arm.

Two-cmask ops only touch @dst bits in the intersection of the two ranges;
bits outside are left unchanged. scx_cmask_or_racy() and
scx_cmask_copy_racy() mirror their non-racy counterparts but read @src
word-by-word through data_race(); callers handle ordering with concurrent
writers themselves.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.c | 259 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/ext_cid.h |   8 ++
 2 files changed, 267 insertions(+)

diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index f50319c5c65e..76c83caba70a 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -393,6 +393,265 @@ __bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux)
 	return scx_cpu_to_cid(sch, cpu);
 }
 
+/*
+ * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating
+ * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms;
+ * cmask_walk_op1() does the same shape over a single cmask range. Every public
+ * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and
+ * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the
+ * selected op and cmask_op2_is_pred() folds the predicate early-exit out of
+ * mutating ops.
+ *
+ * Two-cmask ops only touch @dst bits inside the intersection of the two ranges;
+ * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero
+ * @dst bits that lie outside @src's range.
+ *
+ * The _RACY variants are otherwise identical to their non-racy counterparts but
+ * read @src word-by-word via data_race(). Memory ordering with concurrent
+ * writers is the caller's responsibility.
+ */
+enum cmask_op2 {
+	/* mutating */
+	CMASK_OP2_AND,
+	CMASK_OP2_OR,
+	CMASK_OP2_OR_RACY,
+	CMASK_OP2_COPY,
+	CMASK_OP2_COPY_RACY,
+	CMASK_OP2_ANDNOT,
+	/* predicates - short-circuit when the per-word result is true */
+	CMASK_OP2_SUBSET,
+	CMASK_OP2_INTERSECTS,
+};
+
+static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op)
+{
+	return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS;
+}
+
+static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask,
+					   const enum cmask_op2 op)
+{
+	switch (op) {
+	case CMASK_OP2_AND:
+		*av &= ~mask | *bp;
+		return false;
+	case CMASK_OP2_OR:
+		*av |= *bp & mask;
+		return false;
+	case CMASK_OP2_OR_RACY:
+		*av |= data_race(*bp) & mask;
+		return false;
+	case CMASK_OP2_COPY:
+		*av = (*av & ~mask) | (*bp & mask);
+		return false;
+	case CMASK_OP2_COPY_RACY:
+		*av = (*av & ~mask) | (data_race(*bp) & mask);
+		return false;
+	case CMASK_OP2_ANDNOT:
+		*av &= ~(*bp & mask);
+		return false;
+	case CMASK_OP2_SUBSET:
+		/* stop on the first bit in @sub (*bp) not set in @super (*av) */
+		return (*bp & ~*av) & mask;
+	case CMASK_OP2_INTERSECTS:
+		return (*av & *bp) & mask;
+	}
+	unreachable();
+}
+
+/*
+ * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base,
+ * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words
+ * and return false; predicates return true on the first word whose per-word
+ * test is true. Empty intersection returns false (matches "no bits to consider"
+ * for both mutate and predicate).
+ *
+ * Base/nr_cids are taken as parameters so callers with snapshotted bounds can
+ * drive the walk with values independent of the cmask's header.
+ */
+static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids,
+					   const u64 *b_bits, u32 b_base, u32 b_nr_cids,
+					   const enum cmask_op2 op)
+{
+	u32 lo = max(a_base, b_base);
+	u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids);
+	u32 a_word_off = a_base / 64;
+	u32 b_word_off = b_base / 64;
+	u32 lo_word = lo / 64;
+	u32 hi_word = (hi - 1) / 64;
+	u64 head_mask = GENMASK_U64(63, lo & 63);
+	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+	u32 w;
+
+	if (lo >= hi)
+		return false;
+
+	if (lo_word == hi_word)
+		return cmask_word_op2(&a_bits[lo_word - a_word_off],
+				      &b_bits[lo_word - b_word_off],
+				      head_mask & tail_mask, op);
+
+	if (cmask_word_op2(&a_bits[lo_word - a_word_off],
+			   &b_bits[lo_word - b_word_off], head_mask, op) &&
+	    cmask_op2_is_pred(op))
+		return true;
+
+	for (w = lo_word + 1; w < hi_word; w++)
+		if (cmask_word_op2(&a_bits[w - a_word_off],
+				   &b_bits[w - b_word_off], ~0ULL, op) &&
+		    cmask_op2_is_pred(op))
+			return true;
+
+	return cmask_word_op2(&a_bits[hi_word - a_word_off],
+			      &b_bits[hi_word - b_word_off], tail_mask, op);
+}
+
+enum cmask_op1 {
+	CMASK_OP1_ANY_SET,
+};
+
+static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask,
+					   const enum cmask_op1 op)
+{
+	switch (op) {
+	case CMASK_OP1_ANY_SET:
+		return *ap & mask;
+	}
+	unreachable();
+}
+
+/*
+ * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op.
+ * Returns true on the first word whose per-word test is true; returns false if
+ * no word matches or the range is empty. All current op1s short-circuit on
+ * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred()
+ * guard analogous to cmask_op2_is_pred().
+ */
+static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base,
+					   u32 a_nr_cids,
+					   const enum cmask_op1 op)
+{
+	u32 lo = a_base;
+	u32 hi = a_base + a_nr_cids;
+	u32 a_word_off = a_base / 64;
+	u32 lo_word = lo / 64;
+	u32 hi_word = (hi - 1) / 64;
+	u64 head_mask = GENMASK_U64(63, lo & 63);
+	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+	u32 w;
+
+	if (lo >= hi)
+		return false;
+
+	if (lo_word == hi_word)
+		return cmask_word_op1(&a_bits[lo_word - a_word_off],
+				      head_mask & tail_mask, op);
+
+	if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op))
+		return true;
+	for (w = lo_word + 1; w < hi_word; w++)
+		if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op))
+			return true;
+	return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op);
+}
+
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_AND);
+}
+
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR);
+}
+
+/**
+ * scx_cmask_or_racy - OR @src into @dst, reading @src without locking
+ *
+ * @src is read word-by-word through data_race(). Same per-bit independence
+ * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the
+ * caller's responsibility.
+ */
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY);
+}
+
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY);
+}
+
+/**
+ * scx_cmask_copy_racy - Snapshot @src into @dst without locking
+ *
+ * @src is read word-by-word through data_race(). Head/tail masking matches
+ * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates
+ * just leave some bits fresher than others. Memory ordering with writers is
+ * the caller's responsibility.
+ */
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY);
+}
+
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT);
+}
+
+/*
+ * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure
+ * [@lo, @hi) is contained in @cm's range.
+ */
+static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi)
+{
+	if (lo >= hi)
+		return false;
+	return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo,
+			      CMASK_OP1_ANY_SET);
+}
+
+/**
+ * scx_cmask_subset - test whether @sub is a subset of @super
+ * @sub: cmask to test
+ * @super: cmask to test against
+ *
+ * Return true iff every set bit of @sub is also set in @super.
+ */
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super)
+{
+	u32 super_end = super->base + super->nr_cids;
+	u32 sub_end = sub->base + sub->nr_cids;
+
+	/*
+	 * Set bits in @sub outside @super's range can't be in @super, so any
+	 * such bit means not a subset. The walk below only visits words
+	 * common to both ranges, so these need a separate scan.
+	 */
+	if (sub->base < super->base &&
+	    cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end)))
+		return false;
+	if (sub_end > super_end &&
+	    cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end))
+		return false;
+
+	return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids,
+			       sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET);
+}
+
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b)
+{
+	return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids,
+			      b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS);
+}
+
 /**
  * scx_bpf_cid_topo - Copy out per-cid topology info
  * @cid: cid to look up
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index 223ed0e857ec..73ad280c6d94 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -53,6 +53,14 @@ extern struct btf_id_set8 scx_kfunc_ids_init;
 
 void scx_cmask_clear(struct scx_cmask *m);
 void scx_cmask_fill(struct scx_cmask *m);
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
 s32 scx_cid_init(struct scx_sched *sch);
 int scx_cid_kfunc_init(void);
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids
  2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
@ 2026-05-17 18:43   ` sashiko-bot
  2026-05-17 19:02   ` [PATCH v2 " Tejun Heo
  1 sibling, 0 replies; 14+ messages in thread
From: sashiko-bot @ 2026-05-17 18:43 UTC (permalink / raw)
  To: Tejun Heo; +Cc: sched-ext

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Low] The struct field 'nr_bits' in 'struct scx_cmask' was renamed to 'nr_cids', but the function 'cmask_copy_from_kernel()' in 'tools/sched_ext/include/scx/cid.bpf.h' still references 'src->nr_bits' and 'dst->nr_bits'. This will cause a compilation error for BPF programs including this header.
--

commit 10e6d97c74e4139adeb947156f1940e0b2848eb4
Author: Tejun Heo <tj@kernel.org>

sched_ext: Rename scx_cmask.nr_bits to nr_cids

This commit renames the nr_bits field in struct scx_cmask to nr_cids.
This aligns the naming with the sibling struct scx_cid_shard to provide
a consistent API for future cmask patches.

> diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
> index ebb8cdf906123..c6c4e3db73111 100644
> --- a/kernel/sched/ext_types.h
> +++ b/kernel/sched/ext_types.h
> @@ -82,21 +82,21 @@ struct scx_cid_topo {
>   */
>  struct scx_cmask {
>  	u32 base;
> -	u32 nr_bits;
> +	u32 nr_cids;
>  	DECLARE_FLEX_ARRAY(u64, bits);
>  };

Since the field is renamed to nr_cids here, does cmask_copy_from_kernel() in
tools/sched_ext/include/scx/cid.bpf.h also need to be updated?

It appears to still reference the old nr_bits field:

tools/sched_ext/include/scx/cid.bpf.h:cmask_copy_from_kernel() {
    ...
    if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
        scx_bpf_error("probe-read cmask->nr_bits failed");
        return;
    }

    if (nr_bits > dst->nr_bits) {
        scx_bpf_error("src cmask nr_bits=%u exceeds dst nr_bits=%u",
                      nr_bits, dst->nr_bits);
        return;
    }
    ...
}

Will this cause a compilation error for BPF programs including this header?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260517183614.1191534-1-tj@kernel.org?part=1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids
  2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
  2026-05-17 18:43   ` sashiko-bot
@ 2026-05-17 19:02   ` Tejun Heo
  1 sibling, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 19:02 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

struct scx_cmask is a base-windowed bitmap over cid space. Each bit
represents one cid, so the count of active bits is the count of cids. The
sibling struct scx_cid_shard already uses nr_cids. Rename as a prep so the
following patches that grow the cmask API can use the consistent name.

v2: Also rename src->nr_bits / dst->nr_bits in
    cmask_copy_from_kernel(). (sashiko AI)

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.h                |  8 +--
 kernel/sched/ext_types.h              | 12 ++---
 tools/sched_ext/include/scx/cid.bpf.h | 78 +++++++++++++--------------
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index f41d48afb7d1..e1c44a180bb1 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -138,7 +138,7 @@ static inline bool scx_is_cid_type(void)
 
 static inline bool __scx_cmask_contains(const struct scx_cmask *m, u32 cid)
 {
-	return likely(cid >= m->base && cid < m->base + m->nr_bits);
+	return likely(cid >= m->base && cid < m->base + m->nr_cids);
 }
 
 /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
@@ -147,11 +147,11 @@ static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_bits)
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
 {
 	m->base = base;
-	m->nr_bits = nr_bits;
-	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_bits) * sizeof(u64));
+	m->nr_cids = nr_cids;
+	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
 }
 
 static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index ebb8cdf90612..c6c4e3db7311 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -67,10 +67,10 @@ struct scx_cid_topo {
  * cmask: variable-length, base-windowed bitmap over cid space
  * -----------------------------------------------------------
  *
- * A cmask covers the cid range [base, base + nr_bits). bits[] is aligned to the
+ * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
  * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
  * first (base & 63) bits of bits[0] are head padding and any tail past base +
- * nr_bits is tail padding. Both must stay zero for the lifetime of the mask;
+ * nr_cids is tail padding. Both must stay zero for the lifetime of the mask;
  * all mutating helpers preserve that invariant.
  *
  * Grid alignment means two cmasks always address bits[] against the same global
@@ -82,21 +82,21 @@ struct scx_cid_topo {
  */
 struct scx_cmask {
 	u32 base;
-	u32 nr_bits;
+	u32 nr_cids;
 	DECLARE_FLEX_ARRAY(u64, bits);
 };
 
 /*
- * Number of u64 words of bits[] storage that covers @nr_bits regardless of base
+ * Number of u64 words of bits[] storage that covers @nr_cids regardless of base
  * alignment. The +1 absorbs up to 63 bits of head padding when base is not
  * 64-aligned - always allocating one extra word beats branching on base or
  * splitting the compute.
  */
-#define SCX_CMASK_NR_WORDS(nr_bits)	(((nr_bits) + 63) / 64 + 1)
+#define SCX_CMASK_NR_WORDS(nr_cids)	(((nr_cids) + 63) / 64 + 1)
 
 /*
  * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
- * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_bits.
+ * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_cids.
  */
 #define SCX_CMASK_DEFINE(name, cap_bits)	\
 	DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 7a867e435670..182fed233abc 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -32,11 +32,11 @@
 #define CMASK_MAX_WORDS 129
 #endif
 
-#define CMASK_NR_WORDS(nr_bits)		(((nr_bits) + 63) / 64 + 1)
+#define CMASK_NR_WORDS(nr_cids)		(((nr_cids) + 63) / 64 + 1)
 
 static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
 {
-	return cid >= m->base && cid < m->base + m->nr_bits;
+	return cid >= m->base && cid < m->base + m->nr_cids;
 }
 
 static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
@@ -44,12 +44,12 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
 	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_bits)
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
 {
-	u32 nr_words = CMASK_NR_WORDS(nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
 
 	m->base = base;
-	m->nr_bits = nr_bits;
+	m->nr_cids = nr_cids;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -206,7 +206,7 @@ static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m,
 
 static __always_inline void cmask_zero(struct scx_cmask __arena *m)
 {
-	u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -250,8 +250,8 @@ static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
 static __always_inline void cmask_op(struct scx_cmask __arena *dst,
 				     const struct scx_cmask __arena *src, int op)
 {
-	u32 d_end = dst->base + dst->nr_bits;
-	u32 s_end = src->base + src->nr_bits;
+	u32 d_end = dst->base + dst->nr_cids;
+	u32 s_end = src->base + src->nr_cids;
 	u32 lo = dst->base > src->base ? dst->base : src->base;
 	u32 hi = d_end < s_end ? d_end : s_end;
 	u32 d_base = dst->base / 64;
@@ -286,8 +286,8 @@ static __always_inline void cmask_op(struct scx_cmask __arena *dst,
 
 /*
  * cmask_and/or/copy only modify @dst bits that lie in the intersection of
- * [@dst->base, @dst->base + @dst->nr_bits) and [@src->base,
- * @src->base + @src->nr_bits). Bits in @dst outside that window
+ * [@dst->base, @dst->base + @dst->nr_cids) and [@src->base,
+ * @src->base + @src->nr_cids). Bits in @dst outside that window
  * keep their prior values - in particular, cmask_copy() does NOT zero @dst
  * bits that lie outside @src's range.
  */
@@ -325,9 +325,9 @@ static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
 {
 	u32 nr_words, i;
 
-	if (a->base != b->base || a->nr_bits != b->nr_bits)
+	if (a->base != b->base || a->nr_cids != b->nr_cids)
 		return false;
-	nr_words = CMASK_NR_WORDS(a->nr_bits);
+	nr_words = CMASK_NR_WORDS(a->nr_cids);
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		if (i >= nr_words)
@@ -345,8 +345,8 @@ static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
 static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
 					 const struct scx_cmask __arena *b)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 a_wbase = a->base / 64;
 	u32 b_wbase = b->base / 64;
 	u32 nr_words, i;
@@ -355,7 +355,7 @@ static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
 	if (a->base < b->base || a_end > b_end)
 		return false;
 
-	nr_words = CMASK_NR_WORDS(a->nr_bits);
+	nr_words = CMASK_NR_WORDS(a->nr_cids);
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
 		u32 wi_b;
 
@@ -373,13 +373,13 @@ static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
  * @m: cmask to search
  * @cid: starting cid (clamped to @m->base if below)
  *
- * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
- * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_cids), or
+ * @m->base + @m->nr_cids if none (the out-of-range sentinel matches the
  * termination condition used by cmask_for_each()).
  */
 static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
 {
-	u32 end = m->base + m->nr_bits;
+	u32 end = m->base + m->nr_cids;
 	u32 base = m->base / 64;
 	u32 last_wi = (end - 1) / 64 - base;
 	u32 start_wi, start_bit, i;
@@ -421,17 +421,17 @@ static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
 
 #define cmask_for_each(cid, m)							\
 	for ((cid) = cmask_first_set(m);					\
-	     (cid) < (m)->base + (m)->nr_bits;					\
+	     (cid) < (m)->base + (m)->nr_cids;					\
 	     (cid) = cmask_next_set((m), (cid) + 1))
 
 /*
- * Population count over [base, base + nr_bits). Padding bits in the head/tail
+ * Population count over [base, base + nr_cids). Padding bits in the head/tail
  * words are guaranteed zero by the mutating helpers, so a flat popcount over
  * all words is correct.
  */
 static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
 {
-	u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+	u32 nr_words = CMASK_NR_WORDS(m->nr_cids), i;
 	u32 count = 0;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
@@ -449,8 +449,8 @@ static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
 static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
 					     const struct scx_cmask __arena *b)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 lo = a->base > b->base ? a->base : b->base;
 	u32 hi = a_end < b_end ? a_end : b_end;
 	u32 a_base = a->base / 64;
@@ -489,7 +489,7 @@ static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
 
 /*
  * Find the next cid set in both @a and @b at or after @start, bounded by the
- * intersection of the two ranges. Return a->base + a->nr_bits if none found.
+ * intersection of the two ranges. Return a->base + a->nr_cids if none found.
  *
  * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
  * scan without wrap call this directly.
@@ -498,8 +498,8 @@ static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
 					      const struct scx_cmask __arena *b,
 					      u32 start)
 {
-	u32 a_end = a->base + a->nr_bits;
-	u32 b_end = b->base + b->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
+	u32 b_end = b->base + b->nr_cids;
 	u32 a_wbase = a->base / 64;
 	u32 b_wbase = b->base / 64;
 	u32 lo = a->base > b->base ? a->base : b->base;
@@ -541,15 +541,15 @@ static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
 
 /*
  * Find the next set cid in @m at or after @start, wrapping to @m->base if no
- * set bit is found in [start, m->base + m->nr_bits). Return m->base +
- * m->nr_bits if @m is empty.
+ * set bit is found in [start, m->base + m->nr_cids). Return m->base +
+ * m->nr_cids if @m is empty.
  *
  * Callers do round-robin distribution by passing (last_cid + 1) as @start.
  */
 static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
 					       u32 start)
 {
-	u32 end = m->base + m->nr_bits;
+	u32 end = m->base + m->nr_cids;
 	u32 found;
 
 	found = cmask_next_set(m, start);
@@ -562,7 +562,7 @@ static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m
 
 /*
  * Find the next cid set in both @a and @b at or after @start, wrapping to
- * @a->base if none found in the forward half. Return a->base + a->nr_bits
+ * @a->base if none found in the forward half. Return a->base + a->nr_cids
  * if the intersection is empty.
  *
  * Callers do round-robin distribution by passing (last_cid + 1) as @start.
@@ -571,7 +571,7 @@ static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __aren
 						   const struct scx_cmask __arena *b,
 						   u32 start)
 {
-	u32 a_end = a->base + a->nr_bits;
+	u32 a_end = a->base + a->nr_cids;
 	u32 found;
 
 	found = cmask_next_and_set(a, b, start);
@@ -585,7 +585,7 @@ static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __aren
 /**
  * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
  * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base +
- *     @m->nr_bits) are updated - cpus mapping to cids outside that range
+ *     @m->nr_cids) are updated - cpus mapping to cids outside that range
  *     are ignored.
  * @cpumask: kernel cpumask to translate
  *
@@ -622,7 +622,7 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
 static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
 						   const struct scx_cmask *src)
 {
-	u32 base = 0, nr_bits = 0, nr_words, wi;
+	u32 base = 0, nr_cids = 0, nr_words, wi;
 
 	if (dst->base != 0) {
 		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
@@ -638,18 +638,18 @@ static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst
 		return;
 	}
 
-	if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
-		scx_bpf_error("probe-read cmask->nr_bits failed");
+	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
+		scx_bpf_error("probe-read cmask->nr_cids failed");
 		return;
 	}
 
-	if (nr_bits > dst->nr_bits) {
-		scx_bpf_error("src cmask nr_bits=%u exceeds dst nr_bits=%u",
-			      nr_bits, dst->nr_bits);
+	if (nr_cids > dst->nr_cids) {
+		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
+			      nr_cids, dst->nr_cids);
 		return;
 	}
 
-	nr_words = CMASK_NR_WORDS(nr_bits);
+	nr_words = CMASK_NR_WORDS(nr_cids);
 	cmask_zero(dst);
 	bpf_for(wi, 0, CMASK_MAX_WORDS) {
 		u64 word = 0;
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-17 18:36 ` [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
@ 2026-05-17 19:14   ` sashiko-bot
  2026-05-17 19:29   ` [PATCH v2 " Tejun Heo
  1 sibling, 0 replies; 14+ messages in thread
From: sashiko-bot @ 2026-05-17 19:14 UTC (permalink / raw)
  To: Tejun Heo; +Cc: sched-ext

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] Uninitialized per-CPU cmask scratch buffer silently breaks CPU affinity tracking
- [Medium] Missing bounds validation in BPF helper `__cmask_init()` permits arena out-of-bounds access
--

commit ef0793b90f8713b1c92aecf6ec1f064d2df562a5
Author: Tejun Heo <tj@kernel.org>

sched_ext: Track bits[] storage size in struct scx_cmask

This patch adds tracking for the allocation size of bits[] in struct
scx_cmask to allow validation of active range reshapes. It splits the
bit-range API, introduces scx_cmask_clear() and scx_cmask_fill(), and
updates the BPF mirror functions.

> diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
> index bdd8ef8eae3dc..f50319c5c65e1 100644
> --- a/kernel/sched/ext_cid.c
> +++ b/kernel/sched/ext_cid.c

[ ... ]

> @@ -274,10 +316,9 @@ void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
>   * Initialize @dst to cover the full cid space [0, num_possible_cpus()) and
>   * set the bit for each cid whose cpu is in @src.
> + * Clear @dst's active range and set the bit for each cid whose cpu is in
> + * @src and lies within that range. Out-of-range cids are silently ignored.
>   */
>  void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
>  {
>  	s32 cpu;
>  
> -	scx_cmask_init(dst, 0, num_possible_cpus());
> +	scx_cmask_clear(dst);

Since dst (scx_set_cmask_scratch) is zero-initialized via __alloc_percpu(),
does it ever get its nr_cids explicitly initialized now that the call to
scx_cmask_init() has been removed?

If m->nr_cids remains 0, scx_cmask_clear() will return immediately. The
subsequent loop calling __scx_cmask_set() would then ignore all CPUs because
the bounds check in __scx_cmask_contains() fails for a 0-length range.

Could this result in BPF schedulers receiving an empty cmask in their
ops_cid.set_cmask() callback, breaking task affinity tracking?

>  	for_each_cpu(cpu, src) {
>  		s32 cid = __scx_cpu_to_cid(cpu);

[ ... ]

> diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
> index b9dcc14870d3a..211f4077f43ac 100644
> --- a/tools/sched_ext/include/scx/cid.bpf.h
> +++ b/tools/sched_ext/include/scx/cid.bpf.h
> @@ -44,20 +44,71 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
>  	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
>  }
>  
> -static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
> +/**
> + * __cmask_init - Initialize @m with explicit storage capacity
> + * @m: cmask to initialize
> + * @base: first cid of the active range
> + * @nr_cids: number of cids in the active range
> + * @alloc_cids: storage capacity in cids, at least @nr_cids
> + *
> + * Use when storage is sized larger than the initial active range. All of
> + * bits[] is zeroed.
> + */
> +static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
> +					 u32 nr_cids, u32 alloc_cids)
>  {
> -	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
> +	u32 alloc_words = CMASK_NR_WORDS(alloc_cids), i;
>  
>  	m->base = base;
>  	m->nr_cids = nr_cids;

Should this helper validate that nr_cids does not exceed alloc_cids?

The C implementation (__scx_cmask_init) safely clamps nr_cids to alloc_cids.
Without a similar check here or a call to scx_bpf_error(), passing an
oversized nr_cids allows __cmask_contains() to pass bounds checks for bits
beyond alloc_words.

Could this allow BPF programs to bypass intended constraints and corrupt
their own arena structures during cmask_test() or __cmask_word() calls?

> +	m->alloc_words = alloc_words;
>  
>  	bpf_for(i, 0, CMASK_MAX_WORDS) {

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260517183614.1191534-1-tj@kernel.org?part=2

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-17 18:36 ` [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
  2026-05-17 19:14   ` sashiko-bot
@ 2026-05-17 19:29   ` Tejun Heo
  2026-05-18 22:11     ` Andrea Righi
  1 sibling, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2026-05-17 19:29 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

scx_cmask carries @base and @nr_cids but not the bits[] allocation size, so
helpers reshaping the active range have no way to check it fits and later
kfuncs taking caller-provided storage can't validate it.

Add @alloc_words (u64 word count) annotated with __counted_by, and split the
bit-range API into three pieces:

- SCX_CMASK_DEFINE() / __SCX_CMASK_DEFINE() define an on-stack cmask, the
  latter taking an explicit capacity for oversized storage.
  SCX_CMASK_DEFINE_SHARD() is a thin wrapper that always reserves
  SCX_CID_SHARD_MAX_CPUS bits of storage.

- scx_cmask_init() / __scx_cmask_init() initialize a cmask, with the same
  tight-vs-explicit split.

- scx_cmask_reframe() reshapes the active range without resizing storage.

The BPF mirror (cmask_init / __cmask_init / cmask_reframe) gets the same
shape.

Add scx_cmask_clear() and scx_cmask_fill() to zero and set the
active-range bits respectively. scx_cpumask_to_cmask() uses
scx_cmask_clear(); scx_cmask_init() would otherwise re-write @alloc_words
on every call.

A later patch uses @alloc_words in scx_cmask_ref_shard() to refuse output
storage that can't hold the requested shard.

v2: Init per-CPU scx_set_cmask_scratch (was zero-init, emitted empty
    cmasks). Add nr_cids/alloc_cids check in BPF __cmask_init().
    (sashiko AI)

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.c                | 52 ++++++++++++++++++++--
 kernel/sched/ext_cid.h                | 57 +++++++++++++++++++++++-
 kernel/sched/ext_types.h              | 56 +++++++++++++++++++----
 tools/sched_ext/include/scx/cid.bpf.h | 64 +++++++++++++++++++++++++--
 4 files changed, 213 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index bdd8ef8eae3d..44dd47a87709 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -55,6 +55,7 @@ static s32 scx_cid_arrays_alloc(void)
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
 	struct scx_cmask __percpu *set_cmask_scratch;
+	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -77,6 +78,9 @@ static s32 scx_cid_arrays_alloc(void)
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
+	for_each_possible_cpu(cpu)
+		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
+			       0, npossible);
 	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
@@ -222,19 +226,61 @@ s32 scx_cid_init(struct scx_sched *sch)
 	return 0;
 }
 
+/**
+ * scx_cmask_clear - Zero every bit in @m's active range
+ * @m: cmask to clear
+ *
+ * Storage past the active range is left as is.
+ */
+void scx_cmask_clear(struct scx_cmask *m)
+{
+	u32 nr_words;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0, nr_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_fill - Set every bit in @m's active range
+ * @m: cmask to fill
+ *
+ * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
+ */
+void scx_cmask_fill(struct scx_cmask *m)
+{
+	u32 nr_words, head_bits, tail_bits;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0xff, nr_words * sizeof(u64));
+
+	/* clear word-0 bits below base */
+	head_bits = m->base & 63;
+	if (head_bits)
+		m->bits[0] &= ~((1ULL << head_bits) - 1);
+
+	/* clear last-word bits at or past base + nr_cids */
+	tail_bits = (m->base + m->nr_cids) & 63;
+	if (tail_bits)
+		m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
+}
+
 /**
  * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
  * @src: source cpumask
  * @dst: cmask to write
  *
- * Initialize @dst to cover the full cid space [0, num_possible_cpus()) and
- * set the bit for each cid whose cpu is in @src.
+ * Clear @dst's active range and set the bit for each cid whose cpu is in
+ * @src and lies within that range. Out-of-range cids are silently ignored.
  */
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
 {
 	s32 cpu;
 
-	scx_cmask_init(dst, 0, num_possible_cpus());
+	scx_cmask_clear(dst);
 	for_each_cpu(cpu, src) {
 		s32 cid = __scx_cpu_to_cid(cpu);
 
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index e1c44a180bb1..223ed0e857ec 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -51,6 +51,8 @@ extern s16 *scx_cpu_to_cid_tbl;
 extern struct scx_cid_topo *scx_cid_topo;
 extern struct btf_id_set8 scx_kfunc_ids_init;
 
+void scx_cmask_clear(struct scx_cmask *m);
+void scx_cmask_fill(struct scx_cmask *m);
 s32 scx_cid_init(struct scx_sched *sch);
 int scx_cid_kfunc_init(void);
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
@@ -147,11 +149,64 @@ static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
 }
 
+/**
+ * __scx_cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
+				    u32 alloc_cids)
+{
+	if (WARN_ON_ONCE(alloc_cids < nr_cids))
+		nr_cids = alloc_cids;
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
+	memset(m->bits, 0, m->alloc_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
 static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
 {
+	__scx_cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * scx_cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
+		return;
+
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+
 	m->base = base;
 	m->nr_cids = nr_cids;
-	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
 }
 
 static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index c6c4e3db7311..0c318a359849 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -69,9 +69,10 @@ struct scx_cid_topo {
  *
  * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
  * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
- * first (base & 63) bits of bits[0] are head padding and any tail past base +
- * nr_cids is tail padding. Both must stay zero for the lifetime of the mask;
- * all mutating helpers preserve that invariant.
+ * first (base & 63) bits of bits[0] are head padding and the trailing bits of
+ * the last active word past base + nr_cids are tail padding. Both stay zero;
+ * all mutating helpers preserve that. Words past the last active word are not
+ * read by any helper and have no constraint.
  *
  * Grid alignment means two cmasks always address bits[] against the same global
  * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
@@ -83,7 +84,8 @@ struct scx_cid_topo {
 struct scx_cmask {
 	u32 base;
 	u32 nr_cids;
-	DECLARE_FLEX_ARRAY(u64, bits);
+	u32 alloc_words;
+	u64 bits[] __counted_by(alloc_words);
 };
 
 /*
@@ -94,11 +96,47 @@ struct scx_cmask {
  */
 #define SCX_CMASK_NR_WORDS(nr_cids)	(((nr_cids) + 63) / 64 + 1)
 
-/*
- * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
- * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_cids.
+/**
+ * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length
+ * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS
+ *
+ * @NAME aliases zero-initialized storage with the active range set to
+ * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to
+ * @ALLOC_CIDS.
+ */
+#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS)			\
+	_DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
+		     = { .base = (BASE),					\
+			 .nr_cids = (NR_CIDS),					\
+			 .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })
+
+/**
+ * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, also storage capacity
+ *
+ * @NAME aliases zero-initialized storage with the active range and storage
+ * both [BASE, BASE + NR_CIDS).
+ */
+#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS)					\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS)
+
+/**
+ * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS
+ *
+ * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by
+ * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the
+ * cmask claiming more bits than storage holds and subsequent cmask
+ * operations will overrun.
  */
-#define SCX_CMASK_DEFINE(name, cap_bits)	\
-	DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS)				\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS)
 
 #endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 182fed233abc..257d8bdca966 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -44,20 +44,78 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
 	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+/**
+ * __cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
+					 u32 nr_cids, u32 alloc_cids)
 {
-	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
+	u32 alloc_words, i;
+
+	if (unlikely(nr_cids > alloc_cids)) {
+		scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u",
+			      nr_cids, alloc_cids);
+		return;
+	}
+	alloc_words = CMASK_NR_WORDS(alloc_cids);
 
 	m->base = base;
 	m->nr_cids = nr_cids;
+	m->alloc_words = alloc_words;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
-		if (i >= nr_words)
+		if (i >= alloc_words)
 			break;
 		m->bits[i] = 0;
 	}
 }
 
+/**
+ * cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	__cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) {
+		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
+			      nr_cids, m->alloc_words);
+		return;
+	}
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
 static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
 {
 	if (!__cmask_contains(m, cid))
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-17 19:29   ` [PATCH v2 " Tejun Heo
@ 2026-05-18 22:11     ` Andrea Righi
  2026-05-18 22:53       ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Andrea Righi @ 2026-05-18 22:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, sched-ext, Emil Tsalapatis,
	linux-kernel

Hi Tejun,

On Sun, May 17, 2026 at 09:29:30AM -1000, Tejun Heo wrote:
> scx_cmask carries @base and @nr_cids but not the bits[] allocation size, so
> helpers reshaping the active range have no way to check it fits and later
> kfuncs taking caller-provided storage can't validate it.
> 
> Add @alloc_words (u64 word count) annotated with __counted_by, and split the
> bit-range API into three helpers:
> 
> - SCX_CMASK_DEFINE() / __SCX_CMASK_DEFINE() define an on-stack cmask, the
>   latter taking an explicit capacity for oversized storage.
>   SCX_CMASK_DEFINE_SHARD() is a thin wrapper that always reserves
>   SCX_CID_SHARD_MAX_CPUS bits of storage.
> 
> - scx_cmask_init() / __scx_cmask_init() initialize a cmask, with the same
>   tight-vs-explicit split.
> 
> - scx_cmask_reframe() reshapes the active range without resizing storage.
> 
> The BPF mirror (cmask_init / __cmask_init / cmask_reframe) gets the same
> shape.
> 
> Add scx_cmask_clear() and scx_cmask_fill() to zero and set the
> active-range bits respectively. scx_cpumask_to_cmask() uses
> scx_cmask_clear(); scx_cmask_init() would otherwise re-write @alloc_words
> on every call.
> 
> A later patch uses @alloc_words in scx_cmask_ref_shard() to refuse output
> storage that can't hold the requested shard.
> 
> v2: Init per-CPU scx_set_cmask_scratch (was zero-init, emitted empty
>     cmasks). Add nr_cids/alloc_cids check in BPF __cmask_init().
>     (sashiko AI)
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---

...

> +/**
> + * scx_cmask_reframe - Reshape @m's active range without resizing storage
> + * @m: cmask to reframe
> + * @base: new active range base
> + * @nr_cids: new active range length, must fit within @m->alloc_words
> + *
> + * Body bits within the new range become garbage - only the head and tail
> + * words are zeroed to keep the padding invariant.
> + */
> +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
> +{
> +	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
> +		return;

Considering that:

 #define SCX_CMASK_NR_WORDS(nr_cids)    (((nr_cids) + 63) / 64 + 1)

If we pass nr_cids == UINT_MAX here, we have:

 CMASK_NR_WORDS(UINT_MAX) = (UINT_MAX + 63)/64 + 1 = 62/64 + 1 = 1 (wraps)

Should we simply reject if it's greater than a certain reasonable upper bound?

Thanks,
-Andrea

> +
> +	if (nr_cids) {
> +		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
> +
> +		m->bits[0] = 0;
> +		m->bits[last_word] = 0;
> +	}
> +
>  	m->base = base;
>  	m->nr_cids = nr_cids;
> -	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
>  }

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-18 22:11     ` Andrea Righi
@ 2026-05-18 22:53       ` Tejun Heo
  2026-05-19  5:59         ` Andrea Righi
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2026-05-18 22:53 UTC (permalink / raw)
  To: Andrea Righi
  Cc: David Vernet, Changwoo Min, sched-ext, Emil Tsalapatis,
	linux-kernel

On Tue, May 19, 2026 at 12:11:35AM +0200, Andrea Righi wrote:
> > +/**
> > + * scx_cmask_reframe - Reshape @m's active range without resizing storage
> > + * @m: cmask to reframe
> > + * @base: new active range base
> > + * @nr_cids: new active range length, must fit within @m->alloc_words
> > + *
> > + * Body bits within the new range become garbage - only the head and tail
> > + * words are zeroed to keep the padding invariant.
> > + */
> > +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
> > +{
> > +	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
> > +		return;
> 
> Considering that:
> 
>  #define SCX_CMASK_NR_WORDS(nr_cids)    (((nr_cids) + 63) / 64 + 1)
> 
> If we pass nr_cids == UINT_MAX here, we have:
> 
>  CMASK_NR_WORDS(UINT_MAX) = (UINT_MAX + 63)/64 + 1 = 62/64 + 1 = 1 (wraps)
> 
> Should we simply reject if it's greater than a certain reasonable upper bound?

I'm not sure what we do matters. No matter what, this would be a clear bug
and an unlikely one at that. As long as the backtrace is dumped, I think
anything is fine.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 3/3] sched_ext: Add cmask mask ops
  2026-05-17 18:36 ` [PATCH 3/3] sched_ext: Add cmask mask ops Tejun Heo
@ 2026-05-18 23:58   ` Tejun Heo
  0 siblings, 0 replies; 14+ messages in thread
From: Tejun Heo @ 2026-05-18 23:58 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

Sub-sched cap code and other upcoming consumers need bulk cmask ops, both
mutating (and/or/copy/andnot) and predicate (subset/intersects/empty).

cmask_walk_op2() walks the intersection of two ranges word by word;
cmask_walk_op1() walks one range. Both are __always_inline and dispatched on
a compile-time-constant op enum, so each public entry collapses to a
specialized loop with the inner switch reduced to one arm.

Two-cmask ops only touch bits in the intersection of the two ranges; bits
outside are left unchanged. scx_cmask_or_racy() and scx_cmask_copy_racy()
mirror the locking forms but read @src word-by-word through data_race();
callers handle ordering with concurrent writers themselves.

v2: Add scx_cmask_empty().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.c | 270 +++++++++++++++++++++++++++++++++++++++++
 kernel/sched/ext_cid.h |   9 ++
 2 files changed, 279 insertions(+)

diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 44dd47a87709..0c91b951fd33 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -397,6 +397,276 @@ __bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux)
 	return scx_cpu_to_cid(sch, cpu);
 }
 
+/*
+ * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating
+ * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms;
+ * cmask_walk_op1() does the same shape over a single cmask range. Every public
+ * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and
+ * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the
+ * selected op and cmask_op2_is_pred() folds the predicate early-exit out of
+ * mutating ops.
+ *
+ * Two-cmask ops only touch @dst bits inside the intersection of the two ranges;
+ * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero
+ * @dst bits that lie outside @src's range.
+ *
+ * The _RACY variants are otherwise identical to their non-racy counterpart but
+ * read @src word-by-word via data_race(). Memory ordering with concurrent
+ * writers is the caller's responsibility.
+ */
+enum cmask_op2 {
+	/* mutating */
+	CMASK_OP2_AND,
+	CMASK_OP2_OR,
+	CMASK_OP2_OR_RACY,
+	CMASK_OP2_COPY,
+	CMASK_OP2_COPY_RACY,
+	CMASK_OP2_ANDNOT,
+	/* predicates - short-circuit when the per-word result is true */
+	CMASK_OP2_SUBSET,
+	CMASK_OP2_INTERSECTS,
+};
+
+static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op)
+{
+	return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS;
+}
+
+static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask,
+					   const enum cmask_op2 op)
+{
+	switch (op) {
+	case CMASK_OP2_AND:
+		*av &= ~mask | *bp;
+		return false;
+	case CMASK_OP2_OR:
+		*av |= *bp & mask;
+		return false;
+	case CMASK_OP2_OR_RACY:
+		*av |= data_race(*bp) & mask;
+		return false;
+	case CMASK_OP2_COPY:
+		*av = (*av & ~mask) | (*bp & mask);
+		return false;
+	case CMASK_OP2_COPY_RACY:
+		*av = (*av & ~mask) | (data_race(*bp) & mask);
+		return false;
+	case CMASK_OP2_ANDNOT:
+		*av &= ~(*bp & mask);
+		return false;
+	case CMASK_OP2_SUBSET:
+		/* stop on the first bit in @sub not set in @super */
+		return (*bp & ~*av) & mask;
+	case CMASK_OP2_INTERSECTS:
+		return (*av & *bp) & mask;
+	}
+	unreachable();
+}
+
+/*
+ * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base,
+ * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words
+ * and return false; predicates return true on the first word whose per-word
+ * test is true. Empty intersection returns false (matches "no bits to consider"
+ * for both mutate and predicate).
+ *
+ * Base/nr_cids are taken as parameters so callers with snapshotted bounds can
+ * drive the walk with values independent of the cmask's header.
+ */
+static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids,
+					   const u64 *b_bits, u32 b_base, u32 b_nr_cids,
+					   const enum cmask_op2 op)
+{
+	u32 lo = max(a_base, b_base);
+	u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids);
+	u32 a_word_off = a_base / 64;
+	u32 b_word_off = b_base / 64;
+	u32 lo_word = lo / 64;
+	u32 hi_word = (hi - 1) / 64;
+	u64 head_mask = GENMASK_U64(63, lo & 63);
+	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+	u32 w;
+
+	if (lo >= hi)
+		return false;
+
+	if (lo_word == hi_word)
+		return cmask_word_op2(&a_bits[lo_word - a_word_off],
+				      &b_bits[lo_word - b_word_off],
+				      head_mask & tail_mask, op);
+
+	if (cmask_word_op2(&a_bits[lo_word - a_word_off],
+			   &b_bits[lo_word - b_word_off], head_mask, op) &&
+	    cmask_op2_is_pred(op))
+		return true;
+
+	for (w = lo_word + 1; w < hi_word; w++)
+		if (cmask_word_op2(&a_bits[w - a_word_off],
+				   &b_bits[w - b_word_off], ~0ULL, op) &&
+		    cmask_op2_is_pred(op))
+			return true;
+
+	return cmask_word_op2(&a_bits[hi_word - a_word_off],
+			      &b_bits[hi_word - b_word_off], tail_mask, op);
+}
+
+enum cmask_op1 {
+	CMASK_OP1_ANY_SET,
+};
+
+static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask,
+					   const enum cmask_op1 op)
+{
+	switch (op) {
+	case CMASK_OP1_ANY_SET:
+		return *ap & mask;
+	}
+	unreachable();
+}
+
+/*
+ * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op.
+ * Returns true on the first word whose per-word test is true; returns false if
+ * no word matches or the range is empty. All current op1s short-circuit on
+ * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred()
+ * guard analogous to cmask_op2_is_pred().
+ */
+static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base,
+					   u32 a_nr_cids,
+					   const enum cmask_op1 op)
+{
+	u32 lo = a_base;
+	u32 hi = a_base + a_nr_cids;
+	u32 a_word_off = a_base / 64;
+	u32 lo_word = lo / 64;
+	u32 hi_word = (hi - 1) / 64;
+	u64 head_mask = GENMASK_U64(63, lo & 63);
+	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+	u32 w;
+
+	if (lo >= hi)
+		return false;
+
+	if (lo_word == hi_word)
+		return cmask_word_op1(&a_bits[lo_word - a_word_off],
+				      head_mask & tail_mask, op);
+
+	if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op))
+		return true;
+	for (w = lo_word + 1; w < hi_word; w++)
+		if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op))
+			return true;
+	return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op);
+}
+
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_AND);
+}
+
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR);
+}
+
+/**
+ * scx_cmask_or_racy - OR @src into @dst, reading @src without locking
+ *
+ * @src is read word-by-word through data_race(). Same per-bit independence
+ * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the
+ * caller's responsibility.
+ */
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY);
+}
+
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY);
+}
+
+/**
+ * scx_cmask_copy_racy - Snapshot @src into @dst without locking
+ *
+ * @src is read word-by-word through data_race(). Head/tail masking matches
+ * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates
+ * just leave some bits fresher than others. Memory ordering with writers is
+ * the caller's responsibility.
+ */
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY);
+}
+
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src)
+{
+	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
+		       src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT);
+}
+
+/*
+ * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure
+ * [@lo, @hi) is contained in @cm's range.
+ */
+static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi)
+{
+	if (lo >= hi)
+		return false;
+	return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo,
+			      CMASK_OP1_ANY_SET);
+}
+
+/**
+ * scx_cmask_subset - test whether @sub is a subset of @super
+ * @sub: cmask to test
+ * @super: cmask to test against
+ *
+ * Return true iff every set bit of @sub is also set in @super.
+ */
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super)
+{
+	u32 super_end = super->base + super->nr_cids;
+	u32 sub_end = sub->base + sub->nr_cids;
+
+	/*
+	 * Set bits in @sub outside @super's range can't be in @super, so any
+	 * such bit means not a subset. The walk below only visits words
+	 * common to both ranges, so these need a separate scan.
+	 */
+	if (sub->base < super->base &&
+	    cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end)))
+		return false;
+	if (sub_end > super_end &&
+	    cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end))
+		return false;
+
+	return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids,
+			       sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET);
+}
+
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b)
+{
+	return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids,
+			      b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS);
+}
+
+/**
+ * scx_cmask_empty - Test whether @m has no bits set
+ * @m: cmask to test
+ *
+ * Return true iff @m's active range has no bits set.
+ */
+bool scx_cmask_empty(const struct scx_cmask *m)
+{
+	return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids);
+}
+
 /**
  * scx_bpf_cid_topo - Copy out per-cid topology info
  * @cid: cid to look up
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index 223ed0e857ec..abea22ba2cc2 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -53,6 +53,15 @@ extern struct btf_id_set8 scx_kfunc_ids_init;
 
 void scx_cmask_clear(struct scx_cmask *m);
 void scx_cmask_fill(struct scx_cmask *m);
+void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
+void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
+bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
+bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
+bool scx_cmask_empty(const struct scx_cmask *m);
 s32 scx_cid_init(struct scx_sched *sch);
 int scx_cid_kfunc_init(void);
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-18 22:53       ` Tejun Heo
@ 2026-05-19  5:59         ` Andrea Righi
  0 siblings, 0 replies; 14+ messages in thread
From: Andrea Righi @ 2026-05-19  5:59 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, sched-ext, Emil Tsalapatis,
	linux-kernel

Hi Tejun,

On Mon, May 18, 2026 at 12:53:06PM -1000, Tejun Heo wrote:
> On Tue, May 19, 2026 at 12:11:35AM +0200, Andrea Righi wrote:
> > > +/**
> > > + * scx_cmask_reframe - Reshape @m's active range without resizing storage
> > > + * @m: cmask to reframe
> > > + * @base: new active range base
> > > + * @nr_cids: new active range length, must fit within @m->alloc_words
> > > + *
> > > + * Body bits within the new range become garbage - only the head and tail
> > > + * words are zeroed to keep the padding invariant.
> > > + */
> > > +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
> > > +{
> > > +	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
> > > +		return;
> > 
> > Considering that:
> > 
> >  #define SCX_CMASK_NR_WORDS(nr_cids)    (((nr_cids) + 63) / 64 + 1)
> > 
> > If we pass nr_cids == UINT_MAX here, we have:
> > 
> >  CMASK_NR_WORDS(UINT_MAX) = (UINT_MAX + 63)/64 + 1 = 62/64 + 1 = 1 (wraps)
> > 
> > Should we simply reject if it's greater than a certain reasonable upper bound?
> 
> I'm not sure what we do matters. No matter what, this would be a clear bug
> and an unlikely one at that. As long as the backtrace is dumped, I think
> anything is fine.

Agreed that the bug is unlikely to happen, but the WARN_ON_ONCE() wouldn't fire
at all for nr_cids == UINT_MAX.

However, IIUC scx_cmask_reframe() is internal kernel code, with no callers yet
and the upcoming consumers will probably drive nr_cids from bounded sources,
likely num_possible_cpus() and such. So, the wrap shouldn't really be reachable
on the kernel side. Therefore, I guess we can simply drop the WARN_ON_ONCE().

Instead, cmask_reframe() is the mirrored version that is called from BPF, so any
loaded BPF prog can potentially pass an arbitrary u32.

How about changing the CMASK_NR_WORDS() macro as follows?

 #define CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1))

This way CMASK_NR_WORDS(UINT_MAX) returns ~67M instead of wrapping to 1, so
we'd get a clear scx_bpf_error() backtrace for an oversized nr_cids. WDYT?

Thanks,
-Andrea

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-19  7:58 [PATCHSET v2 sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
@ 2026-05-19  7:58 ` Tejun Heo
  2026-05-19  8:48   ` sashiko-bot
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2026-05-19  7:58 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min
  Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo

scx_cmask carries @base and @nr_cids but not the bits[] allocation size, so
helpers reshaping the active range have no way to check it fits and later
kfuncs taking caller-provided storage can't validate it.

Add @alloc_words (u64 word count) annotated with __counted_by, and split the
bit-range API into three helpers:

- SCX_CMASK_DEFINE() / __SCX_CMASK_DEFINE() define an on-stack cmask, the
  latter taking an explicit capacity for oversized storage.
  SCX_CMASK_DEFINE_SHARD() is a thin wrapper that always reserves
  SCX_CID_SHARD_MAX_CPUS bits of storage.

- scx_cmask_init() / __scx_cmask_init() initialize a cmask, with the same
  tight-vs-explicit split.

- scx_cmask_reframe() reshapes the active range without resizing storage.

The BPF mirror (cmask_init / __cmask_init / cmask_reframe) gets the same
shape.

Add scx_cmask_clear() and scx_cmask_fill() to zero and set the
active-range bits respectively. scx_cpumask_to_cmask() uses
scx_cmask_clear(); scx_cmask_init() would otherwise re-write @alloc_words
on every call.

A later patch uses @alloc_words in scx_cmask_ref_shard() to refuse output
storage that can't hold the requested shard.

v2: Init per-CPU scx_set_cmask_scratch (was zero-init, emitted empty
    cmasks). Add nr_cids/alloc_cids check in BPF __cmask_init().
    (sashiko AI)
    Widen SCX_CMASK_NR_WORDS()/CMASK_NR_WORDS() to compute in u64 so that
    @nr_cids near U32_MAX no longer wraps to a small value and bypasses
    the bounds check in cmask_reframe(). (Andrea)

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_cid.c                | 52 +++++++++++++++++--
 kernel/sched/ext_cid.h                | 57 ++++++++++++++++++++-
 kernel/sched/ext_types.h              | 62 +++++++++++++++++++----
 tools/sched_ext/include/scx/cid.bpf.h | 72 +++++++++++++++++++++++++--
 4 files changed, 224 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index bdd8ef8eae3d..44dd47a87709 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -55,6 +55,7 @@ static s32 scx_cid_arrays_alloc(void)
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
 	struct scx_cmask __percpu *set_cmask_scratch;
+	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -77,6 +78,9 @@ static s32 scx_cid_arrays_alloc(void)
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
+	for_each_possible_cpu(cpu)
+		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
+			       0, npossible);
 	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
@@ -222,19 +226,61 @@ s32 scx_cid_init(struct scx_sched *sch)
 	return 0;
 }
 
+/**
+ * scx_cmask_clear - Zero every bit in @m's active range
+ * @m: cmask to clear
+ *
+ * Storage past the active range is left as is.
+ */
+void scx_cmask_clear(struct scx_cmask *m)
+{
+	u32 nr_words;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0, nr_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_fill - Set every bit in @m's active range
+ * @m: cmask to fill
+ *
+ * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
+ */
+void scx_cmask_fill(struct scx_cmask *m)
+{
+	u32 nr_words, head_bits, tail_bits;
+
+	if (!m->nr_cids)
+		return;
+	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
+	memset(m->bits, 0xff, nr_words * sizeof(u64));
+
+	/* clear word-0 bits below base */
+	head_bits = m->base & 63;
+	if (head_bits)
+		m->bits[0] &= ~((1ULL << head_bits) - 1);
+
+	/* clear last-word bits at or past base + nr_cids */
+	tail_bits = (m->base + m->nr_cids) & 63;
+	if (tail_bits)
+		m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
+}
+
 /**
  * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
  * @src: source cpumask
  * @dst: cmask to write
  *
- * Initialize @dst to cover the full cid space [0, num_possible_cpus()) and
- * set the bit for each cid whose cpu is in @src.
+ * Clear @dst's active range and set the bit for each cid whose cpu is in
+ * @src and lies within that range. Out-of-range cids are silently ignored.
  */
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
 {
 	s32 cpu;
 
-	scx_cmask_init(dst, 0, num_possible_cpus());
+	scx_cmask_clear(dst);
 	for_each_cpu(cpu, src) {
 		s32 cid = __scx_cpu_to_cid(cpu);
 
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index e1c44a180bb1..223ed0e857ec 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -51,6 +51,8 @@ extern s16 *scx_cpu_to_cid_tbl;
 extern struct scx_cid_topo *scx_cid_topo;
 extern struct btf_id_set8 scx_kfunc_ids_init;
 
+void scx_cmask_clear(struct scx_cmask *m);
+void scx_cmask_fill(struct scx_cmask *m);
 s32 scx_cid_init(struct scx_sched *sch);
 int scx_cid_kfunc_init(void);
 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
@@ -147,11 +149,64 @@ static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
 }
 
+/**
+ * __scx_cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
+				    u32 alloc_cids)
+{
+	if (WARN_ON_ONCE(alloc_cids < nr_cids))
+		nr_cids = alloc_cids;
+
+	m->base = base;
+	m->nr_cids = nr_cids;
+	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
+	memset(m->bits, 0, m->alloc_words * sizeof(u64));
+}
+
+/**
+ * scx_cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
 static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
 {
+	__scx_cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * scx_cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
+{
+	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
+		return;
+
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+
 	m->base = base;
 	m->nr_cids = nr_cids;
-	memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_cids) * sizeof(u64));
 }
 
 static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index c6c4e3db7311..8b3527e21fca 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -69,9 +69,10 @@ struct scx_cid_topo {
  *
  * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the
  * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
- * first (base & 63) bits of bits[0] are head padding and any tail past base +
- * nr_cids is tail padding. Both must stay zero for the lifetime of the mask;
- * all mutating helpers preserve that invariant.
+ * first (base & 63) bits of bits[0] are head padding and the trailing bits of
+ * the last active word past base + nr_cids are tail padding. Both stay zero;
+ * all mutating helpers preserve that. Words past the last active word are not
+ * read by any helper and have no constraint.
  *
  * Grid alignment means two cmasks always address bits[] against the same global
  * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
@@ -83,22 +84,61 @@ struct scx_cid_topo {
 struct scx_cmask {
 	u32 base;
 	u32 nr_cids;
-	DECLARE_FLEX_ARRAY(u64, bits);
+	u32 alloc_words;
+	u64 bits[] __counted_by(alloc_words);
 };
 
 /*
  * Number of u64 words of bits[] storage that covers @nr_cids regardless of base
  * alignment. The +1 absorbs up to 63 bits of head padding when base is not
  * 64-aligned - always allocating one extra word beats branching on base or
- * splitting the compute.
+ * splitting the compute. The u64 cast keeps the +63 from wrapping when @nr_cids
+ * is near U32_MAX, so callers bounds-checking the result against @alloc_words
+ * catch the overflow instead of seeing a small value.
  */
-#define SCX_CMASK_NR_WORDS(nr_cids)	(((nr_cids) + 63) / 64 + 1)
+#define SCX_CMASK_NR_WORDS(nr_cids)	((u32)(((u64)(nr_cids) + 63) / 64 + 1))
 
-/*
- * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
- * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_cids.
+/**
+ * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length
+ * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS
+ *
+ * @NAME aliases zero-initialized storage with the active range set to
+ * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to
+ * @ALLOC_CIDS.
+ */
+#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS)			\
+	_DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
+		     = { .base = (BASE),					\
+			 .nr_cids = (NR_CIDS),					\
+			 .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })
+
+/**
+ * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, also storage capacity
+ *
+ * @NAME aliases zero-initialized storage with the active range and storage
+ * both [BASE, BASE + NR_CIDS).
+ */
+#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS)					\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS)
+
+/**
+ * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard
+ * @NAME: variable name to define
+ * @BASE: first cid of the active range
+ * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS
+ *
+ * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by
+ * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the
+ * cmask claiming more bits than storage holds and subsequent cmask
+ * operations will overrun.
  */
-#define SCX_CMASK_DEFINE(name, cap_bits)	\
-	DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS)				\
+	__SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS)
 
 #endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 182fed233abc..e281c88fa824 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -32,7 +32,13 @@
 #define CMASK_MAX_WORDS 129
 #endif
 
-#define CMASK_NR_WORDS(nr_cids)		(((nr_cids) + 63) / 64 + 1)
+/*
+ * Mirrors SCX_CMASK_NR_WORDS in kernel/sched/ext_types.h. The u64 cast keeps
+ * the +63 from wrapping when @nr_cids is near U32_MAX, so cmask_reframe()
+ * bounds-checking the result against alloc_words catches the overflow instead
+ * of seeing a small value.
+ */
+#define CMASK_NR_WORDS(nr_cids)		((u32)(((u64)(nr_cids) + 63) / 64 + 1))
 
 static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
 {
@@ -44,20 +50,78 @@ static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena
 	return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
 }
 
-static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+/**
+ * __cmask_init - Initialize @m with explicit storage capacity
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ * @alloc_cids: storage capacity in cids, at least @nr_cids
+ *
+ * Use when storage is sized larger than the initial active range. All of
+ * bits[] is zeroed.
+ */
+static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
+					 u32 nr_cids, u32 alloc_cids)
 {
-	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
+	u32 alloc_words, i;
+
+	if (unlikely(nr_cids > alloc_cids)) {
+		scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u",
+			      nr_cids, alloc_cids);
+		return;
+	}
+	alloc_words = CMASK_NR_WORDS(alloc_cids);
 
 	m->base = base;
 	m->nr_cids = nr_cids;
+	m->alloc_words = alloc_words;
 
 	bpf_for(i, 0, CMASK_MAX_WORDS) {
-		if (i >= nr_words)
+		if (i >= alloc_words)
 			break;
 		m->bits[i] = 0;
 	}
 }
 
+/**
+ * cmask_init - Initialize @m on tight storage
+ * @m: cmask to initialize
+ * @base: first cid of the active range
+ * @nr_cids: number of cids in the active range
+ *
+ * All of bits[] is zeroed.
+ */
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	__cmask_init(m, base, nr_cids, nr_cids);
+}
+
+/**
+ * cmask_reframe - Reshape @m's active range without resizing storage
+ * @m: cmask to reframe
+ * @base: new active range base
+ * @nr_cids: new active range length, must fit within @m->alloc_words
+ *
+ * Body bits within the new range become garbage - only the head and tail
+ * words are zeroed to keep the padding invariant.
+ */
+static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
+{
+	if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) {
+		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
+			      nr_cids, m->alloc_words);
+		return;
+	}
+	if (nr_cids) {
+		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
+
+		m->bits[0] = 0;
+		m->bits[last_word] = 0;
+	}
+	m->base = base;
+	m->nr_cids = nr_cids;
+}
+
 static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
 {
 	if (!__cmask_contains(m, cid))
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask
  2026-05-19  7:58 ` [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
@ 2026-05-19  8:48   ` sashiko-bot
  0 siblings, 0 replies; 14+ messages in thread
From: sashiko-bot @ 2026-05-19  8:48 UTC (permalink / raw)
  To: Tejun Heo; +Cc: sched-ext

Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
- [High] The `__SCX_CMASK_DEFINE` macro leaves the `bits[]` flexible array uninitialized, causing subsequent operations to operate on stack garbage.
- [Medium] The BPF helper `__cmask_init()` returns early on failure without initializing the structure, exposing uninitialized memory to the executing BPF program.
- [Medium] BPF `__cmask_init()` does not validate `alloc_cids` against `CMASK_MAX_WORDS`, leading to inconsistent state and truncated operations.
- [Medium] Integer wrap-around in `cmask_reframe()` and `scx_cmask_reframe()` leaves tail padding uncleared when `nr_cids` is near `U32_MAX`.
--

commit af0adccc546dd475652f096375ea97f8c951b0f6
Author: Tejun Heo <tj@kernel.org>

sched_ext: Track bits[] storage size in struct scx_cmask

This commit adds tracking of the allocated word count to scx_cmask, splits
the bit-range API into explicit-capacity and tight-capacity helpers, and
provides equivalent BPF mirror functions. It also adds clear and fill
helpers to manage the active-range bits safely.

diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
[ ... ]
> +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
> +{
> +	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
> +		return;
> +
> +	if (nr_cids) {
> +		u32 last_word = ((base & 63) + nr_cids - 1) / 64;

The commit message states:
"Widen SCX_CMASK_NR_WORDS()/CMASK_NR_WORDS() to compute in u64 so that
@nr_cids near U32_MAX no longer wraps to a small value and bypasses
the bounds check in cmask_reframe()."

Despite this, could the last_word calculation here still wrap around?

If nr_cids is near U32_MAX, ((base & 63) + nr_cids - 1) might wrap around in
u32 arithmetic, resulting in a small last_word value (potentially 0). This
would cause the head word to be cleared again instead of the intended tail
word, leaving tail padding uncleared.

diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
[ ... ]
> +#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS)			\
> +	_DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \
> +		     = { .base = (BASE),					\
> +			 .nr_cids = (NR_CIDS),					\
> +			 .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) })

Does _DEFINE_FLEX with a designated initializer fully zero-initialize the
flexible array members?

In C, initializing specific struct members does not necessarily zero out
the remaining padding or flexible array bytes. The documentation mentions
this macro aliases zero-initialized storage, but without the explicit
memset() that was previously handled by scx_cmask_init(), could the bits
array end up containing stack garbage?

diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
[ ... ]
> +static __always_inline void __cmask_init(struct scx_cmask __arena *m, u32 base,
> +					 u32 nr_cids, u32 alloc_cids)
>  {
> -	u32 nr_words = CMASK_NR_WORDS(nr_cids), i;
> +	u32 alloc_words, i;
> +
> +	if (unlikely(nr_cids > alloc_cids)) {
> +		scx_bpf_error("__cmask_init: nr_cids=%u exceeds alloc_cids=%u",
> +			      nr_cids, alloc_cids);
> +		return;
> +	}

Because scx_bpf_error() schedules an asynchronous abort rather than halting
the BPF program synchronously, will the program continue executing with an
uninitialized mask?

The kernel counterpart __scx_cmask_init() safely clamps the value to
alloc_cids instead of returning early. Would clamping the value or zeroing
the memory here prevent subsequent BPF logic from operating on uninitialized
memory?

> +	alloc_words = CMASK_NR_WORDS(alloc_cids);
>  
>  	m->base = base;
>  	m->nr_cids = nr_cids;
> +	m->alloc_words = alloc_words;
>  
>  	bpf_for(i, 0, CMASK_MAX_WORDS) {
> -		if (i >= nr_words)
> +		if (i >= alloc_words)
>  			break;
>  		m->bits[i] = 0;
>  	}
>  }

If alloc_cids is very large, could alloc_words exceed CMASK_MAX_WORDS?

If so, this initialization loop will stop at CMASK_MAX_WORDS (129), leaving
the remaining words uninitialized. However, m->alloc_words will still record
the larger size. Should there be a validation check against CMASK_MAX_WORDS
to prevent silent truncation in this and other bitmask operations?

[ ... ]
> +static __always_inline void cmask_reframe(struct scx_cmask __arena *m, u32 base, u32 nr_cids)
> +{
> +	if (CMASK_NR_WORDS(nr_cids) > m->alloc_words) {
> +		scx_bpf_error("cmask_reframe: nr_cids=%u exceeds alloc_words=%u",
> +			      nr_cids, m->alloc_words);
> +		return;
> +	}
> +	if (nr_cids) {
> +		u32 last_word = ((base & 63) + nr_cids - 1) / 64;

Similar to the kernel helper scx_cmask_reframe(), could this calculation wrap
around 32-bit limits if nr_cids is near U32_MAX, resulting in last_word being
calculated incorrectly?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260519075838.2706712-1-tj@kernel.org?part=2

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2026-05-19  8:48 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-17 18:36 [PATCHSET RESEND sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
2026-05-17 18:36 ` [PATCH 1/3] sched_ext: Rename scx_cmask.nr_bits to nr_cids Tejun Heo
2026-05-17 18:43   ` sashiko-bot
2026-05-17 19:02   ` [PATCH v2 " Tejun Heo
2026-05-17 18:36 ` [PATCH 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
2026-05-17 19:14   ` sashiko-bot
2026-05-17 19:29   ` [PATCH v2 " Tejun Heo
2026-05-18 22:11     ` Andrea Righi
2026-05-18 22:53       ` Tejun Heo
2026-05-19  5:59         ` Andrea Righi
2026-05-17 18:36 ` [PATCH 3/3] sched_ext: Add cmask mask ops Tejun Heo
2026-05-18 23:58   ` [PATCH v2 " Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2026-05-19  7:58 [PATCHSET v2 sched_ext/for-7.2] sched_ext: cmask improvements Tejun Heo
2026-05-19  7:58 ` [PATCH v2 2/3] sched_ext: Track bits[] storage size in struct scx_cmask Tejun Heo
2026-05-19  8:48   ` sashiko-bot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.