* [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space
@ 2026-04-24 1:32 Tejun Heo
From: Tejun Heo @ 2026-04-24 1:32 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: sched-ext, emil, linux-kernel, Cheng-Yang Chou, Zhao Mengmeng,
Tejun Heo
Sub-scheduler code built on cids needs bitmaps scoped to a slice of cid
space (e.g. the idle cids of a shard). A cpumask sized for NR_CPUS wastes
most of its bits for a small window and is awkward in BPF.
scx_cmask covers [base, base + nr_bits). bits[] is aligned to the global
64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64). Any two
cmasks therefore address bits[] against the same global windows, so
cross-cmask word ops reduce to
dest->bits[i] OP= operand->bits[i - delta]
with no bit-shifting, at the cost of up to one extra storage word for
head misalignment. This alignment guarantee is the reason binary ops
can stay word-level; every mutating helper preserves it.
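For illustration (the numbers are made up): take dest->base = 6, whose
bits[0] covers global window [0, 64) and bits[1] covers [64, 128), and
operand->base = 70, whose bits[0] covers [64, 128). The word delta is
70/64 - 6/64 = 1, so the overlapping window reduces to
	dest->bits[1] OP= operand->bits[0];	/* i = 1, delta = 1 */
regardless of where the two bases fall within their words.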
Kernel side in ext_cid.[hc]; BPF side in tools/sched_ext/include/scx/
cid.bpf.h. BPF side drops the scx_ prefix (redundant in BPF code) and
adds the extra helpers that basic idle-cpu selection needs.
No callers yet.
v2: Narrow to helpers that will be used in the planned changes;
set/bit/find/zero ops will be added as usage develops.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
kernel/sched/ext_cid.h | 25 +
kernel/sched/ext_types.h | 38 ++
tools/sched_ext/include/scx/cid.bpf.h | 597 +++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 1 +
4 files changed, 661 insertions(+)
create mode 100644 tools/sched_ext/include/scx/cid.bpf.h
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index 52edb66b53fd..c3c429d2c8e2 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -127,4 +127,29 @@ static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
return __scx_cpu_to_cid(cpu);
}
+static inline bool __scx_cmask_contains(const struct scx_cmask *m, u32 cid)
+{
+ return likely(cid >= m->base && cid < m->base + m->nr_bits);
+}
+
+/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
+static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
+{
+ return (u64 *)&m->bits[cid / 64 - m->base / 64];
+}
+
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_bits)
+{
+ m->base = base;
+ m->nr_bits = nr_bits;
+ memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_bits) * sizeof(u64));
+}
+
+static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
+{
+ if (!__scx_cmask_contains(m, cid))
+ return;
+ *__scx_cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
#endif /* _KERNEL_SCHED_EXT_CID_H */
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index be4d3565ae8d..ebb8cdf90612 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -63,4 +63,42 @@ struct scx_cid_topo {
s32 node_idx;
};
+/*
+ * cmask: variable-length, base-windowed bitmap over cid space
+ * -----------------------------------------------------------
+ *
+ * A cmask covers the cid range [base, base + nr_bits). bits[] is aligned to the
+ * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
+ * first (base & 63) bits of bits[0] are head padding and any tail past base +
+ * nr_bits is tail padding. Both must stay zero for the lifetime of the mask;
+ * all mutating helpers preserve that invariant.
+ *
+ * Grid alignment means two cmasks always address bits[] against the same global
+ * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
+ *
+ * dst->bits[i] OP= src->bits[i - delta]
+ *
+ * with no bit-shifting, regardless of how the two bases relate mod 64.
+ */
+struct scx_cmask {
+ u32 base;
+ u32 nr_bits;
+ DECLARE_FLEX_ARRAY(u64, bits);
+};
+
+/*
+ * Number of u64 words of bits[] storage that covers @nr_bits regardless of base
+ * alignment. The +1 absorbs up to 63 bits of head padding when base is not
+ * 64-aligned - always allocating one extra word beats branching on base or
+ * splitting the compute.
+ */
+#define SCX_CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+/*
+ * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
+ * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_bits.
+ */
+#define SCX_CMASK_DEFINE(name, cap_bits) \
+ DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+
#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
new file mode 100644
index 000000000000..9b6c757d2f97
--- /dev/null
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -0,0 +1,597 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the
+ * authoritative layout and semantics. The BPF-side helpers use the cmask_*
+ * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is
+ * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every
+ * helper is inline (no .c counterpart).
+ *
+ * Included by scx/common.bpf.h; don't include directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_CID_BPF_H
+#define __SCX_CID_BPF_H
+
+#include "bpf_arena_common.bpf.h"
+
+#ifndef BIT_U64
+#define BIT_U64(nr) (1ULL << (nr))
+#endif
+#ifndef GENMASK_U64
+#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
+#endif
+
+/*
+ * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with
+ * one extra word for head-misalignment. Increase if deployment targets larger
+ * NR_CPUS.
+ */
+#ifndef CMASK_MAX_WORDS
+#define CMASK_MAX_WORDS 129
+#endif
+
+#define CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
+{
+ return cid >= m->base && cid < m->base + m->nr_bits;
+}
+
+static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
+{
+ return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
+}
+
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_bits)
+{
+ u32 nr_words = CMASK_NR_WORDS(nr_bits), i;
+
+ m->base = base;
+ m->nr_bits = nr_bits;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return false;
+ return *__cmask_word(m, cid) & BIT_U64(cid & 63);
+}
+
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is far above what any non-pathological contention needs.
+ * Exhausting it means the bit update was lost, which corrupts the caller's view
+ * of the bitmap, so raise scx_bpf_error() to abort the scheduler.
+ */
+#define CMASK_CAS_TRIES 1024
+
+static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return true;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline void __cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
+static __always_inline void __cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) &= ~BIT_U64(cid & 63);
+}
+
+static __always_inline bool __cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w |= bit;
+ return prev;
+}
+
+static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w &= ~bit;
+ return prev;
+}
+
+static __always_inline void cmask_zero(struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/*
+ * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_*
+ * enum in ext_cid.c, which is exported via BTF and reachable through
+ * vmlinux.h.
+ */
+enum {
+ BPF_CMASK_OP_AND,
+ BPF_CMASK_OP_OR,
+ BPF_CMASK_OP_COPY,
+};
+
+static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src,
+ u32 di, u32 si, u64 mask, int op)
+{
+ u64 dv = dst->bits[di];
+ u64 sv = src->bits[si];
+ u64 rv;
+
+ if (op == BPF_CMASK_OP_AND)
+ rv = dv & sv;
+ else if (op == BPF_CMASK_OP_OR)
+ rv = dv | sv;
+ else
+ rv = sv;
+
+ dst->bits[di] = (dv & ~mask) | (rv & mask);
+}
+
+static __always_inline void cmask_op(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src, int op)
+{
+ u32 d_end = dst->base + dst->nr_bits;
+ u32 s_end = src->base + src->nr_bits;
+ u32 lo = dst->base > src->base ? dst->base : src->base;
+ u32 hi = d_end < s_end ? d_end : s_end;
+ u32 d_base = dst->base / 64;
+ u32 s_base = src->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 m;
+
+ if (w > hi_word)
+ break;
+
+ m = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ m &= head_mask;
+ if (w == hi_word)
+ m &= tail_mask;
+
+ cmask_op_word(dst, src, w - d_base, w - s_base, m, op);
+ }
+}
+
+/*
+ * cmask_and/or/copy only modify @dst bits that lie in the intersection of
+ * [@dst->base, @dst->base + @dst->nr_bits) and [@src->base,
+ * @src->base + @src->nr_bits). Bits in @dst outside that window
+ * keep their prior values - in particular, cmask_copy() does NOT zero @dst
+ * bits that lie outside @src's range.
+ */
+static __always_inline void cmask_and(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_AND);
+}
+
+static __always_inline void cmask_or(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_OR);
+}
+
+static __always_inline void cmask_copy(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_COPY);
+}
+
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
+ * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 base = m->base / 64;
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = (base + wi) * 64 + __builtin_ctzll(word);
+ if (found >= end)
+ return end;
+ return found;
+ }
+ return end;
+}
+
+static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
+{
+ return cmask_next_set(m, m->base);
+}
+
+#define cmask_for_each(cid, m) \
+ for ((cid) = cmask_first_set(m); \
+ (cid) < (m)->base + (m)->nr_bits; \
+ (cid) = cmask_next_set((m), (cid) + 1))
+
+/*
+ * Population count over [base, base + nr_bits). Padding bits in the head/tail
+ * words are guaranteed zero by the mutating helpers, so a flat popcount over
+ * all words is correct.
+ */
+static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+ u32 count = 0;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ count += __builtin_popcountll(m->bits[i]);
+ }
+ return count;
+}
+
+/*
+ * True if @a and @b share any set bit. Walk only the intersection of their
+ * ranges, matching the semantics of cmask_and().
+ */
+static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 a_base = a->base / 64;
+ u32 b_base = b->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return false;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 mask, av, bv;
+
+ if (w > hi_word)
+ break;
+
+ mask = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ mask &= head_mask;
+ if (w == hi_word)
+ mask &= tail_mask;
+
+ av = a->bits[w - a_base] & mask;
+ bv = b->bits[w - b_base] & mask;
+ if (av & bv)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, bounded by the
+ * intersection of the two ranges. Return a->base + a->nr_bits if none found.
+ *
+ * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
+ * scan without wrap call this directly.
+ */
+static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 last_wi, start_wi, start_bit, i;
+
+ if (lo >= hi)
+ return a_end;
+ if (start < lo)
+ start = lo;
+ if (start >= hi)
+ return a_end;
+
+ last_wi = (hi - 1) / 64;
+ start_wi = start / 64;
+ start_bit = start & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 abs_wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (abs_wi > last_wi)
+ break;
+
+ word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = abs_wi * 64 + __builtin_ctzll(word);
+ if (found >= hi)
+ return a_end;
+ return found;
+ }
+ return a_end;
+}
+
+/*
+ * Find the next set cid in @m at or after @start, wrapping to @m->base if no
+ * set bit is found in [start, m->base + m->nr_bits). Return m->base +
+ * m->nr_bits if @m is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
+ u32 start)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 found;
+
+ found = cmask_next_set(m, start);
+ if (found < end || start <= m->base)
+ return found;
+
+ found = cmask_next_set(m, m->base);
+ return found < start ? found : end;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, wrapping to
+ * @a->base if none found in the forward half. Return a->base + a->nr_bits
+ * if the intersection is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 found;
+
+ found = cmask_next_and_set(a, b, start);
+ if (found < a_end || start <= a->base)
+ return found;
+
+ found = cmask_next_and_set(a, b, a->base);
+ return found < start ? found : a_end;
+}
+
+/**
+ * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
+ * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base +
+ * @m->nr_bits) are updated - cpus mapping to cids outside that range
+ * are ignored.
+ * @cpumask: kernel cpumask to translate
+ *
+ * For each cpu in @cpumask, set the cpu's cid in @m. Caller must ensure
+ * @cpumask stays stable across the call (e.g. RCU read lock for
+ * task->cpus_ptr).
+ */
+static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
+ const struct cpumask *cpumask)
+{
+ u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ s32 cpu;
+
+ cmask_zero(m);
+ bpf_for(cpu, 0, nr_cpu_ids) {
+ s32 cid;
+
+ if (!bpf_cpumask_test_cpu(cpu, cpumask))
+ continue;
+ cid = scx_bpf_cpu_to_cid(cpu);
+ if (cid >= 0)
+ __cmask_set(m, cid);
+ }
+}
+
+/**
+ * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
+ * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
+ * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
+ *
+ * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
+ * scx_bpf_error() on probe failure or precondition violation.
+ */
+static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
+ const struct scx_cmask *src)
+{
+ u32 nr_bits = 0, nr_words, dst_nr_words, wi;
+
+ if (dst->base != 0) {
+ scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
+ return;
+ }
+
+ if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
+ scx_bpf_error("probe-read cmask->nr_bits failed");
+ return;
+ }
+
+ nr_words = CMASK_NR_WORDS(nr_bits);
+ dst_nr_words = CMASK_NR_WORDS(dst->nr_bits);
+ if (nr_words > dst_nr_words) {
+ scx_bpf_error("src cmask nr_bits=%u exceeds dst capacity",
+ nr_bits);
+ return;
+ }
+
+ cmask_zero(dst);
+ bpf_for(wi, 0, CMASK_MAX_WORDS) {
+ u64 word = 0;
+ if (wi >= nr_words)
+ break;
+ if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
+ scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
+ return;
+ }
+ dst->bits[wi] = word;
+ }
+}
+
+#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 4bf959a8cd08..3e353dfafb46 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -1055,5 +1055,6 @@ static inline u64 scx_clock_irq(u32 cpu)
#include "compat.bpf.h"
#include "enums.bpf.h"
+#include "cid.bpf.h"
#endif /* __SCX_COMMON_BPF_H */
--
2.53.0
* [PATCHSET v2 REPOST sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops
@ 2026-04-24 17:27 Tejun Heo
2026-04-24 17:27 ` [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space Tejun Heo
From: Tejun Heo @ 2026-04-24 17:27 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: sched-ext, emil, linux-kernel, Cheng-Yang Chou, Zhao Mengmeng,
Tejun Heo
Hello,
Reposting v2 because the original send was not properly threaded -
each patch went out as a standalone top-level message. Content is
unchanged from the original v2.
Original v2: https://lore.kernel.org/r/20260424013220.2923402-1-tj@kernel.org
v2 of https://lore.kernel.org/r/20260421071945.3110084-1-tj@kernel.org
v2:
- Add ext-types.h first patch for early subsystem-wide type defs.
- cid: publish the cid tables with WRITE_ONCE / read with READ_ONCE;
document the visibility contract.
- cid-kfuncs: NULL-guard scx_bpf_this_cid / scx_bpf_task_cid for
TRACING/SYSCALL callers before any SCX sched has enabled.
- cid-struct-ops: use struct_size() for the set_cmask_scratch percpu
alloc; cluster __scx_is_cid_type disable with __scx_enabled disable
in scx_root_disable().
- cid-kfunc-filter: sync per-entry kfunc flags with each kfunc's
primary BTF_ID_FLAGS() declaration (Zhao). pahole intersects flags
across occurrences; omitting them drops the flags globally - the
visible symptom was KF_IMPLICIT_ARGS getting cleared on
scx_bpf_kick_cpu, leaking bpf_prog_aux into vmlinux.h (see the
sketch after this list).
- cmask: narrow to the helpers this series actually uses;
cmask_copy_from_kernel contract and runtime guard.
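As a sketch of the kfunc-filter rule (the BTF set name below is
hypothetical; BTF_ID_FLAGS() and the intersection behavior are as
described above): every occurrence of a kfunc must repeat the flags of
its primary declaration, or pahole's intersection clears them globally.
	BTF_KFUNCS_START(scx_kfunc_ids_example)	/* hypothetical set */
	/* must carry KF_IMPLICIT_ARGS here too; a bare
	 * BTF_ID_FLAGS(func, scx_bpf_kick_cpu) entry would drop the
	 * flag from every occurrence */
	BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
	BTF_KFUNCS_END(scx_kfunc_ids_example)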
This patchset introduces topological CPU IDs (cids) - dense,
topology-ordered cpu identifiers - and an alternative cid-form struct_ops
type that lets BPF schedulers operate in cid space directly.
Key pieces:
- cid space: scx_cid_init() walks nodes * LLCs * cores * threads and packs
a dense cid mapping. The mapping can be overridden via
scx_bpf_cid_override(). See "Topological CPU IDs" in ext_cid.h for the
model.
- cmask: a base-windowed bitmap over cid space. Kernel and BPF helpers with
identical semantics. Used by scx_qmap for per-task affinity and idle-cid
tracking; meant to be the substrate for sub-sched cid allocation (see
the sketch after this list).
- bpf_sched_ext_ops_cid: a parallel struct_ops type whose callbacks take
cids/cmasks instead of cpus/cpumasks. Kernel translates at the boundary
via scx_cpu_arg() / scx_cpu_ret(); the two struct types share offsets up
through @priv (verified by BUILD_BUG_ON) so the union view in scx_sched
works without function-pointer casts. Sub-sched support is tied to
cid-form: validate_ops() rejects cpu-form sub-scheds and cpu-form roots
that expose sub_attach / sub_detach.
- cid-form kfuncs: scx_bpf_kick_cid, scx_bpf_cidperf_{cap,cur,set},
scx_bpf_cid_curr, scx_bpf_task_cid, scx_bpf_this_cid,
scx_bpf_nr_{cids,online_cids}, scx_bpf_cid_to_cpu, scx_bpf_cpu_to_cid.
A cid-form program may not call cpu-only kfuncs (enforced at verifier
load via scx_kfunc_context_filter); the reverse is intentionally
permissive to ease migration.
- scx_qmap port: scx_qmap is converted to cid-form. It uses the cmask-based
idle picker, per-task cid-space cpus_allowed, and cid-form kfuncs
throughout. Sub-sched dispatching via scx_bpf_sub_dispatch() continues to
work.
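To make the cmask piece concrete, a minimal sketch of the kind of idle
picking described above (the idle and allowed cmasks are hypothetical;
the helpers are the ones patch 10 adds):
	/* round-robin claim of an idle cid the task may use; @last is
	 * the previously picked cid */
	static s32 pick_idle_cid(struct scx_cmask __arena *idle,
				 struct scx_cmask __arena *allowed, u32 last)
	{
		u32 cid = cmask_next_and_set_wrap(idle, allowed, last + 1);

		/* base + nr_bits is the "none found" sentinel */
		if (cid >= idle->base + idle->nr_bits)
			return -1;
		/* atomically claim it; a lost race reports no cid */
		if (!cmask_test_and_clear(idle, cid))
			return -1;
		return cid;
	}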
v2 re-tested on the 16-cpu QEMU: cid-form scx_qmap, cpu-form scx_simple,
cid<->cpu cycling, scx_qmap under stress-ng, hotplug auto-restart, and
sub-sched (root scx_qmap + cgroup-scoped scx_qmap child). Clean.
Based on sched_ext/for-7.2 (c2929bc21dce).
0001-sched_ext-Add-ext_types.h-for-early-subsystem-wide-d.patch
0002-sched_ext-Rename-ops_cpu_valid-to-scx_cpu_valid-and-.patch
0003-sched_ext-Move-scx_exit-scx_error-and-friends-to-ext.patch
0004-sched_ext-Shift-scx_kick_cpu-validity-check-to-scx_b.patch
0005-sched_ext-Relocate-cpu_acquire-cpu_release-to-end-of.patch
0006-sched_ext-Make-scx_enable-take-scx_enable_cmd.patch
0007-sched_ext-Add-topological-CPU-IDs-cids.patch
0008-sched_ext-Add-scx_bpf_cid_override-kfunc.patch
0009-tools-sched_ext-Add-struct_size-helpers-to-common.bp.patch
0010-sched_ext-Add-cmask-a-base-windowed-bitmap-over-cid-.patch
0011-sched_ext-Add-cid-form-kfunc-wrappers-alongside-cpu-.patch
0012-sched_ext-Add-bpf_sched_ext_ops_cid-struct_ops-type.patch
0013-sched_ext-Forbid-cpu-form-kfuncs-from-cid-form-sched.patch
0014-tools-sched_ext-scx_qmap-Restart-on-hotplug-instead-.patch
0015-tools-sched_ext-scx_qmap-Add-cmask-based-idle-tracki.patch
0016-tools-sched_ext-scx_qmap-Port-to-cid-form-struct_ops.patch
0017-sched_ext-Require-cid-form-struct_ops-for-sub-sched-.patch
Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-cid-v2
kernel/sched/build_policy.c | 2 +
kernel/sched/ext.c | 650 +++++++++++++++++++++++++----
kernel/sched/ext_cid.c | 417 ++++++++++++++++++++
kernel/sched/ext_cid.h | 164 ++++++++
kernel/sched/ext_idle.c | 8 +-
kernel/sched/ext_internal.h | 203 +++++++---
kernel/sched/ext_types.h | 104 +++++
tools/sched_ext/include/scx/cid.bpf.h | 597 ++++++++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 23 ++
tools/sched_ext/include/scx/compat.bpf.h | 24 ++
tools/sched_ext/scx_qmap.bpf.c | 306 ++++++++-------
tools/sched_ext/scx_qmap.c | 25 +-
tools/sched_ext/scx_qmap.h | 2 +-
13 files changed, 2240 insertions(+), 285 deletions(-)
--
tejun
* [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space
2026-04-24 17:27 [PATCHSET v2 REPOST sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops Tejun Heo
@ 2026-04-24 17:27 ` Tejun Heo
From: Tejun Heo @ 2026-04-24 17:27 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: sched-ext, emil, linux-kernel, Cheng-Yang Chou, Zhao Mengmeng,
Tejun Heo
Sub-scheduler code built on cids needs bitmaps scoped to a slice of cid
space (e.g. the idle cids of a shard). A cpumask sized for NR_CPUS wastes
most of its bits for a small window and is awkward in BPF.
scx_cmask covers [base, base + nr_bits). bits[] is aligned to the global
64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64). Any two
cmasks therefore address bits[] against the same global windows, so
cross-cmask word ops reduce to
dest->bits[i] OP= operand->bits[i - delta]
with no bit-shifting, at the cost of up to one extra storage word for
head misalignment. This alignment guarantee is the reason binary ops
can stay word-level; every mutating helper preserves it.
Kernel side in ext_cid.[hc]; BPF side in tools/sched_ext/include/scx/
cid.bpf.h. BPF side drops the scx_ prefix (redundant in BPF code) and
adds the extra helpers that basic idle-cpu selection needs.
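For example, on the kernel side (a minimal sketch using only the
helpers added below; the window values are made up):
	SCX_CMASK_DEFINE(m, 128);	/* on-stack, zero-initialized */

	scx_cmask_init(m, 70, 16);	/* covers cids [70, 86) */
	__scx_cmask_set(m, 75);		/* non-atomic set */
	__scx_cmask_set(m, 200);	/* outside the window: ignored */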
No callers yet.
v2: Narrow to helpers that will be used in the planned changes;
set/bit/find/zero ops will be added as usage develops.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
kernel/sched/ext_cid.h | 25 +
kernel/sched/ext_types.h | 38 ++
tools/sched_ext/include/scx/cid.bpf.h | 597 +++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 1 +
4 files changed, 661 insertions(+)
create mode 100644 tools/sched_ext/include/scx/cid.bpf.h
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index 52edb66b53fd..c3c429d2c8e2 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -127,4 +127,29 @@ static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
return __scx_cpu_to_cid(cpu);
}
+static inline bool __scx_cmask_contains(const struct scx_cmask *m, u32 cid)
+{
+ return likely(cid >= m->base && cid < m->base + m->nr_bits);
+}
+
+/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
+static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
+{
+ return (u64 *)&m->bits[cid / 64 - m->base / 64];
+}
+
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_bits)
+{
+ m->base = base;
+ m->nr_bits = nr_bits;
+ memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_bits) * sizeof(u64));
+}
+
+static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
+{
+ if (!__scx_cmask_contains(m, cid))
+ return;
+ *__scx_cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
#endif /* _KERNEL_SCHED_EXT_CID_H */
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index be4d3565ae8d..ebb8cdf90612 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -63,4 +63,42 @@ struct scx_cid_topo {
s32 node_idx;
};
+/*
+ * cmask: variable-length, base-windowed bitmap over cid space
+ * -----------------------------------------------------------
+ *
+ * A cmask covers the cid range [base, base + nr_bits). bits[] is aligned to the
+ * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
+ * first (base & 63) bits of bits[0] are head padding and any tail past base +
+ * nr_bits is tail padding. Both must stay zero for the lifetime of the mask;
+ * all mutating helpers preserve that invariant.
+ *
+ * Grid alignment means two cmasks always address bits[] against the same global
+ * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
+ *
+ * dst->bits[i] OP= src->bits[i - delta]
+ *
+ * with no bit-shifting, regardless of how the two bases relate mod 64.
+ */
+struct scx_cmask {
+ u32 base;
+ u32 nr_bits;
+ DECLARE_FLEX_ARRAY(u64, bits);
+};
+
+/*
+ * Number of u64 words of bits[] storage that covers @nr_bits regardless of base
+ * alignment. The +1 absorbs up to 63 bits of head padding when base is not
+ * 64-aligned - always allocating one extra word beats branching on base or
+ * splitting the compute.
+ */
+#define SCX_CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+/*
+ * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
+ * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_bits.
+ */
+#define SCX_CMASK_DEFINE(name, cap_bits) \
+ DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+
#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
new file mode 100644
index 000000000000..9b6c757d2f97
--- /dev/null
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -0,0 +1,597 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the
+ * authoritative layout and semantics. The BPF-side helpers use the cmask_*
+ * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is
+ * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every
+ * helper is inline (no .c counterpart).
+ *
+ * Included by scx/common.bpf.h; don't include directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_CID_BPF_H
+#define __SCX_CID_BPF_H
+
+#include "bpf_arena_common.bpf.h"
+
+#ifndef BIT_U64
+#define BIT_U64(nr) (1ULL << (nr))
+#endif
+#ifndef GENMASK_U64
+#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
+#endif
+
+/*
+ * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with
+ * one extra word for head-misalignment. Increase if deployment targets larger
+ * NR_CPUS.
+ */
+#ifndef CMASK_MAX_WORDS
+#define CMASK_MAX_WORDS 129
+#endif
+
+#define CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
+{
+ return cid >= m->base && cid < m->base + m->nr_bits;
+}
+
+static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
+{
+ return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
+}
+
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_bits)
+{
+ u32 nr_words = CMASK_NR_WORDS(nr_bits), i;
+
+ m->base = base;
+ m->nr_bits = nr_bits;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return false;
+ return *__cmask_word(m, cid) & BIT_U64(cid & 63);
+}
+
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is far above what any non-pathological contention needs.
+ * Exhausting it means the bit update was lost, which corrupts the caller's view
+ * of the bitmap, so raise scx_bpf_error() to abort the scheduler.
+ */
+#define CMASK_CAS_TRIES 1024
+
+static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return true;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline void __cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
+static __always_inline void __cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) &= ~BIT_U64(cid & 63);
+}
+
+static __always_inline bool __cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w |= bit;
+ return prev;
+}
+
+static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w &= ~bit;
+ return prev;
+}
+
+static __always_inline void cmask_zero(struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/*
+ * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_*
+ * enum in ext_cid.c, which is exported via BTF and reachable through
+ * vmlinux.h.
+ */
+enum {
+ BPF_CMASK_OP_AND,
+ BPF_CMASK_OP_OR,
+ BPF_CMASK_OP_COPY,
+};
+
+static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src,
+ u32 di, u32 si, u64 mask, int op)
+{
+ u64 dv = dst->bits[di];
+ u64 sv = src->bits[si];
+ u64 rv;
+
+ if (op == BPF_CMASK_OP_AND)
+ rv = dv & sv;
+ else if (op == BPF_CMASK_OP_OR)
+ rv = dv | sv;
+ else
+ rv = sv;
+
+ dst->bits[di] = (dv & ~mask) | (rv & mask);
+}
+
+static __always_inline void cmask_op(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src, int op)
+{
+ u32 d_end = dst->base + dst->nr_bits;
+ u32 s_end = src->base + src->nr_bits;
+ u32 lo = dst->base > src->base ? dst->base : src->base;
+ u32 hi = d_end < s_end ? d_end : s_end;
+ u32 d_base = dst->base / 64;
+ u32 s_base = src->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 m;
+
+ if (w > hi_word)
+ break;
+
+ m = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ m &= head_mask;
+ if (w == hi_word)
+ m &= tail_mask;
+
+ cmask_op_word(dst, src, w - d_base, w - s_base, m, op);
+ }
+}
+
+/*
+ * cmask_and/or/copy only modify @dst bits that lie in the intersection of
+ * [@dst->base, @dst->base + @dst->nr_bits) and [@src->base,
+ * @src->base + @src->nr_bits). Bits in @dst outside that window
+ * keep their prior values - in particular, cmask_copy() does NOT zero @dst
+ * bits that lie outside @src's range.
+ */
+static __always_inline void cmask_and(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_AND);
+}
+
+static __always_inline void cmask_or(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_OR);
+}
+
+static __always_inline void cmask_copy(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_COPY);
+}
+
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
+ * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 base = m->base / 64;
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = (base + wi) * 64 + __builtin_ctzll(word);
+ if (found >= end)
+ return end;
+ return found;
+ }
+ return end;
+}
+
+static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
+{
+ return cmask_next_set(m, m->base);
+}
+
+#define cmask_for_each(cid, m) \
+ for ((cid) = cmask_first_set(m); \
+ (cid) < (m)->base + (m)->nr_bits; \
+ (cid) = cmask_next_set((m), (cid) + 1))
+
+/*
+ * Population count over [base, base + nr_bits). Padding bits in the head/tail
+ * words are guaranteed zero by the mutating helpers, so a flat popcount over
+ * all words is correct.
+ */
+static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+ u32 count = 0;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ count += __builtin_popcountll(m->bits[i]);
+ }
+ return count;
+}
+
+/*
+ * True if @a and @b share any set bit. Walk only the intersection of their
+ * ranges, matching the semantics of cmask_and().
+ */
+static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 a_base = a->base / 64;
+ u32 b_base = b->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return false;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 mask, av, bv;
+
+ if (w > hi_word)
+ break;
+
+ mask = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ mask &= head_mask;
+ if (w == hi_word)
+ mask &= tail_mask;
+
+ av = a->bits[w - a_base] & mask;
+ bv = b->bits[w - b_base] & mask;
+ if (av & bv)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, bounded by the
+ * intersection of the two ranges. Return a->base + a->nr_bits if none found.
+ *
+ * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
+ * scan without wrap call this directly.
+ */
+static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 last_wi, start_wi, start_bit, i;
+
+ if (lo >= hi)
+ return a_end;
+ if (start < lo)
+ start = lo;
+ if (start >= hi)
+ return a_end;
+
+ last_wi = (hi - 1) / 64;
+ start_wi = start / 64;
+ start_bit = start & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 abs_wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (abs_wi > last_wi)
+ break;
+
+ word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = abs_wi * 64 + __builtin_ctzll(word);
+ if (found >= hi)
+ return a_end;
+ return found;
+ }
+ return a_end;
+}
+
+/*
+ * Find the next set cid in @m at or after @start, wrapping to @m->base if no
+ * set bit is found in [start, m->base + m->nr_bits). Return m->base +
+ * m->nr_bits if @m is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
+ u32 start)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 found;
+
+ found = cmask_next_set(m, start);
+ if (found < end || start <= m->base)
+ return found;
+
+ found = cmask_next_set(m, m->base);
+ return found < start ? found : end;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, wrapping to
+ * @a->base if none found in the forward half. Return a->base + a->nr_bits
+ * if the intersection is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 found;
+
+ found = cmask_next_and_set(a, b, start);
+ if (found < a_end || start <= a->base)
+ return found;
+
+ found = cmask_next_and_set(a, b, a->base);
+ return found < start ? found : a_end;
+}
+
+/**
+ * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
+ * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base +
+ * @m->nr_bits) are updated - cpus mapping to cids outside that range
+ * are ignored.
+ * @cpumask: kernel cpumask to translate
+ *
+ * For each cpu in @cpumask, set the cpu's cid in @m. Caller must ensure
+ * @cpumask stays stable across the call (e.g. RCU read lock for
+ * task->cpus_ptr).
+ */
+static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
+ const struct cpumask *cpumask)
+{
+ u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ s32 cpu;
+
+ cmask_zero(m);
+ bpf_for(cpu, 0, nr_cpu_ids) {
+ s32 cid;
+
+ if (!bpf_cpumask_test_cpu(cpu, cpumask))
+ continue;
+ cid = scx_bpf_cpu_to_cid(cpu);
+ if (cid >= 0)
+ __cmask_set(m, cid);
+ }
+}
+
+/**
+ * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
+ * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
+ * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
+ *
+ * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
+ * scx_bpf_error() on probe failure or precondition violation.
+ */
+static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
+ const struct scx_cmask *src)
+{
+ u32 nr_bits = 0, nr_words, dst_nr_words, wi;
+
+ if (dst->base != 0) {
+ scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
+ return;
+ }
+
+ if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
+ scx_bpf_error("probe-read cmask->nr_bits failed");
+ return;
+ }
+
+ nr_words = CMASK_NR_WORDS(nr_bits);
+ dst_nr_words = CMASK_NR_WORDS(dst->nr_bits);
+ if (nr_words > dst_nr_words) {
+ scx_bpf_error("src cmask nr_bits=%u exceeds dst capacity",
+ nr_bits);
+ return;
+ }
+
+ cmask_zero(dst);
+ bpf_for(wi, 0, CMASK_MAX_WORDS) {
+ u64 word = 0;
+ if (wi >= nr_words)
+ break;
+ if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
+ scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
+ return;
+ }
+ dst->bits[wi] = word;
+ }
+}
+
+#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 4bf959a8cd08..3e353dfafb46 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -1055,5 +1055,6 @@ static inline u64 scx_clock_irq(u32 cpu)
#include "compat.bpf.h"
#include "enums.bpf.h"
+#include "cid.bpf.h"
#endif /* __SCX_COMMON_BPF_H */
--
2.53.0
* [PATCHSET v3 sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops
@ 2026-04-28 20:35 Tejun Heo
2026-04-28 20:35 ` [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space Tejun Heo
From: Tejun Heo @ 2026-04-28 20:35 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo
Hello,
v3 (all from the Sashiko AI review at
https://sashiko.dev/#/patchset/20260424172721.3458520-1-tj%40kernel.org):
- cid: drop leaked cpus_read_lock() on scx_cid_init() failure;
BUILD_BUG_ON tightened to NR_CPUS<=8192 to match the BPF cmask
helpers' CMASK_MAX_WORDS coverage.
- bpf-struct-size: use offsetof() in struct_size() to match the
kernel <linux/overflow.h> macro semantics (no inflation from
trailing struct padding); see the sketch after this list.
- cmask: cmask_copy_from_kernel() validates src->base==0 via
probe-read; nr_bits check is bit-level rather than rounded-up
word-count.
- cid-qmap-idle: qmap_init() refuses to load when scx_bpf_nr_cids()
exceeds SCX_QMAP_MAX_CPUS; the task_ctx flex array would otherwise
overflow into the next slab entry.
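The offsetof() form amounts to the following (a sketch mirroring the
description above, not the literal patch 0009 macro):
	/* count from the start of the flex member so trailing struct
	 * padding never inflates the result */
	#define struct_size(p, member, n)				\
		(offsetof(typeof(*(p)), member) +			\
		 sizeof((p)->member[0]) * (n))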
v2: https://lore.kernel.org/r/20260424172721.3458520-1-tj@kernel.org
v1: https://lore.kernel.org/r/20260421071945.3110084-1-tj@kernel.org
This patchset introduces topological CPU IDs (cids) - dense,
topology-ordered cpu identifiers - and an alternative cid-form struct_ops
type that lets BPF schedulers operate in cid space directly.
Key pieces:
- cid space: scx_cid_init() walks nodes * LLCs * cores * threads and packs
a dense cid mapping. The mapping can be overridden via
scx_bpf_cid_override(). See "Topological CPU IDs" in ext_cid.h for the
model.
- cmask: a base-windowed bitmap over cid space. Kernel and BPF helpers with
identical semantics. Used by scx_qmap for per-task affinity and idle-cid
tracking; meant to be the substrate for sub-sched cid allocation (see
the sketch after this list).
- bpf_sched_ext_ops_cid: a parallel struct_ops type whose callbacks take
cids/cmasks instead of cpus/cpumasks. Kernel translates at the boundary
via scx_cpu_arg() / scx_cpu_ret(); the two struct types share offsets up
through @priv (verified by BUILD_BUG_ON) so the union view in scx_sched
works without function-pointer casts. Sub-sched support is tied to
cid-form: validate_ops() rejects cpu-form sub-scheds and cpu-form roots
that expose sub_attach / sub_detach.
- cid-form kfuncs: scx_bpf_kick_cid, scx_bpf_cidperf_{cap,cur,set},
scx_bpf_cid_curr, scx_bpf_task_cid, scx_bpf_this_cid,
scx_bpf_nr_{cids,online_cids}, scx_bpf_cid_to_cpu, scx_bpf_cpu_to_cid.
A cid-form program may not call cpu-only kfuncs (enforced at verifier
load via scx_kfunc_context_filter); the reverse is intentionally
permissive to ease migration.
- scx_qmap port: scx_qmap is converted to cid-form. It uses the cmask-based
idle picker, per-task cid-space cpus_allowed, and cid-form kfuncs
throughout. Sub-sched dispatching via scx_bpf_sub_dispatch() continues to
work.
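For instance, the per-task cid-space cpus_allowed could be refreshed
roughly as follows (taskc->allowed is a hypothetical per-task arena
cmask; cmask_from_cpumask() is from patch 10):
	/* translate the task's kernel cpumask into cid space; the
	 * cpumask must stay stable across the call (e.g. RCU) */
	cmask_from_cpumask(taskc->allowed, p->cpus_ptr);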
v3 re-tested on the 16-cpu QEMU: cid-form scx_qmap under stress-ng plus
reload cycles, hotplug auto-restart, and sub-sched (root scx_qmap +
cgroup-scoped scx_qmap child). Clean.
Based on sched_ext/for-7.2 (4939721aad2e).
0001-sched_ext-Add-ext_types.h-for-early-subsystem-wide-d.patch
0002-sched_ext-Rename-ops_cpu_valid-to-scx_cpu_valid-and-.patch
0003-sched_ext-Move-scx_exit-scx_error-and-friends-to-ext.patch
0004-sched_ext-Shift-scx_kick_cpu-validity-check-to-scx_b.patch
0005-sched_ext-Relocate-cpu_acquire-cpu_release-to-end-of.patch
0006-sched_ext-Make-scx_enable-take-scx_enable_cmd.patch
0007-sched_ext-Add-topological-CPU-IDs-cids.patch
0008-sched_ext-Add-scx_bpf_cid_override-kfunc.patch
0009-tools-sched_ext-Add-struct_size-helpers-to-common.bp.patch
0010-sched_ext-Add-cmask-a-base-windowed-bitmap-over-cid-.patch
0011-sched_ext-Add-cid-form-kfunc-wrappers-alongside-cpu-.patch
0012-sched_ext-Add-bpf_sched_ext_ops_cid-struct_ops-type.patch
0013-sched_ext-Forbid-cpu-form-kfuncs-from-cid-form-sched.patch
0014-tools-sched_ext-scx_qmap-Restart-on-hotplug-instead-.patch
0015-tools-sched_ext-scx_qmap-Add-cmask-based-idle-tracki.patch
0016-tools-sched_ext-scx_qmap-Port-to-cid-form-struct_ops.patch
0017-sched_ext-Require-cid-form-struct_ops-for-sub-sched-.patch
Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git scx-cid-v3
kernel/sched/build_policy.c | 3 +
kernel/sched/ext.c | 651 ++++++++++++++++++++++++++----
kernel/sched/ext_cid.c | 409 +++++++++++++++++++
kernel/sched/ext_cid.h | 164 ++++++++
kernel/sched/ext_idle.c | 8 +-
kernel/sched/ext_internal.h | 205 +++++++---
kernel/sched/ext_types.h | 104 +++++
tools/sched_ext/include/scx/cid.bpf.h | 667 +++++++++++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 23 ++
tools/sched_ext/include/scx/compat.bpf.h | 24 ++
tools/sched_ext/scx_qmap.bpf.c | 346 +++++++++-------
tools/sched_ext/scx_qmap.c | 70 +++-
tools/sched_ext/scx_qmap.h | 2 +-
13 files changed, 2391 insertions(+), 285 deletions(-)
Thanks.
--
tejun
* [PATCH 10/17] sched_ext: Add cmask, a base-windowed bitmap over cid space
2026-04-28 20:35 [PATCHSET v3 sched_ext/for-7.2] sched_ext: Topological CPU IDs and cid-form struct_ops Tejun Heo
@ 2026-04-28 20:35 ` Tejun Heo
From: Tejun Heo @ 2026-04-28 20:35 UTC
To: David Vernet, Andrea Righi, Changwoo Min
Cc: sched-ext, Emil Tsalapatis, linux-kernel, Tejun Heo,
Cheng-Yang Chou
Sub-scheduler code built on cids needs bitmaps scoped to a slice of cid
space (e.g. the idle cids of a shard). A cpumask sized for NR_CPUS wastes
most of its bits for a small window and is awkward in BPF.
scx_cmask covers [base, base + nr_bits). bits[] is aligned to the global
64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64). Any two
cmasks therefore address bits[] against the same global windows, so
cross-cmask word ops reduce to
dest->bits[i] OP= operand->bits[i - delta]
with no bit-shifting, at the cost of up to one extra storage word for
head misalignment. This alignment guarantee is the reason binary ops
can stay word-level; every mutating helper preserves it.
Kernel side in ext_cid.[hc]; BPF side in tools/sched_ext/include/scx/
cid.bpf.h. BPF side drops the scx_ prefix (redundant in BPF code) and
adds the extra helpers that basic idle-cpu selection needs.
No callers yet.
v2: Narrow to helpers that will be used in the planned changes;
set/bit/find/zero ops will be added as usage develops.
v3: cmask_copy_from_kernel: validate src->base == 0 via probe-read;
bit-level nr_bits check instead of round-up word count. (Sashiko)
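The bit-level check amounts to roughly the following (assumed shape,
hypothetical variable names):
	/* e.g. src nr_bits = 64 vs dst nr_bits = 1 round up to the same
	 * word count, but the copy would claim bits dst doesn't cover */
	if (src_nr_bits > dst->nr_bits) {
		scx_bpf_error("src cmask nr_bits=%u exceeds dst capacity",
			      src_nr_bits);
		return;
	}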
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
---
kernel/sched/ext_cid.h | 25 +
kernel/sched/ext_types.h | 38 ++
tools/sched_ext/include/scx/cid.bpf.h | 667 +++++++++++++++++++++++
tools/sched_ext/include/scx/common.bpf.h | 1 +
4 files changed, 731 insertions(+)
create mode 100644 tools/sched_ext/include/scx/cid.bpf.h
diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h
index 52edb66b53fd..c3c429d2c8e2 100644
--- a/kernel/sched/ext_cid.h
+++ b/kernel/sched/ext_cid.h
@@ -127,4 +127,29 @@ static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
return __scx_cpu_to_cid(cpu);
}
+static inline bool __scx_cmask_contains(const struct scx_cmask *m, u32 cid)
+{
+ return likely(cid >= m->base && cid < m->base + m->nr_bits);
+}
+
+/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
+static inline u64 *__scx_cmask_word(const struct scx_cmask *m, u32 cid)
+{
+ return (u64 *)&m->bits[cid / 64 - m->base / 64];
+}
+
+static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_bits)
+{
+ m->base = base;
+ m->nr_bits = nr_bits;
+ memset(m->bits, 0, SCX_CMASK_NR_WORDS(nr_bits) * sizeof(u64));
+}
+
+static inline void __scx_cmask_set(struct scx_cmask *m, u32 cid)
+{
+ if (!__scx_cmask_contains(m, cid))
+ return;
+ *__scx_cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
#endif /* _KERNEL_SCHED_EXT_CID_H */
diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h
index be4d3565ae8d..ebb8cdf90612 100644
--- a/kernel/sched/ext_types.h
+++ b/kernel/sched/ext_types.h
@@ -63,4 +63,42 @@ struct scx_cid_topo {
s32 node_idx;
};
+/*
+ * cmask: variable-length, base-windowed bitmap over cid space
+ * -----------------------------------------------------------
+ *
+ * A cmask covers the cid range [base, base + nr_bits). bits[] is aligned to the
+ * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the
+ * first (base & 63) bits of bits[0] are head padding and any tail past base +
+ * nr_bits is tail padding. Both must stay zero for the lifetime of the mask;
+ * all mutating helpers preserve that invariant.
+ *
+ * Grid alignment means two cmasks always address bits[] against the same global
+ * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to
+ *
+ * dst->bits[i] OP= src->bits[i - delta]
+ *
+ * with no bit-shifting, regardless of how the two bases relate mod 64.
+ */
+struct scx_cmask {
+ u32 base;
+ u32 nr_bits;
+ DECLARE_FLEX_ARRAY(u64, bits);
+};
+
+/*
+ * Number of u64 words of bits[] storage that covers @nr_bits regardless of base
+ * alignment. The +1 absorbs up to 63 bits of head padding when base is not
+ * 64-aligned - always allocating one extra word beats branching on base or
+ * splitting the compute.
+ */
+#define SCX_CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+/*
+ * Define an on-stack cmask for up to @cap_bits. @name is a struct scx_cmask *
+ * aliasing zero-initialized storage; call scx_cmask_init() to set base/nr_bits.
+ */
+#define SCX_CMASK_DEFINE(name, cap_bits) \
+ DEFINE_RAW_FLEX(struct scx_cmask, name, bits, SCX_CMASK_NR_WORDS(cap_bits))
+
#endif /* _KERNEL_SCHED_EXT_TYPES_H */
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
new file mode 100644
index 000000000000..960108708eed
--- /dev/null
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -0,0 +1,667 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF-side helpers for cids and cmasks. See kernel/sched/ext_cid.h for the
+ * authoritative layout and semantics. The BPF-side helpers use the cmask_*
+ * naming (no scx_ prefix); cmask is the SCX bitmap type so the prefix is
+ * redundant in BPF code. Atomics use __sync_val_compare_and_swap and every
+ * helper is inline (no .c counterpart).
+ *
+ * Included by scx/common.bpf.h; don't include directly.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+#ifndef __SCX_CID_BPF_H
+#define __SCX_CID_BPF_H
+
+#include "bpf_arena_common.bpf.h"
+
+#ifndef BIT_U64
+#define BIT_U64(nr) (1ULL << (nr))
+#endif
+#ifndef GENMASK_U64
+#define GENMASK_U64(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
+#endif
+
+/*
+ * Storage cap for bounded loops over bits[]. Sized to cover NR_CPUS=8192 with
+ * one extra word for head misalignment. Increase if the deployment targets
+ * a larger NR_CPUS.
+ */
+#ifndef CMASK_MAX_WORDS
+#define CMASK_MAX_WORDS 129
+#endif
+
+#define CMASK_NR_WORDS(nr_bits) (((nr_bits) + 63) / 64 + 1)
+
+static __always_inline bool __cmask_contains(const struct scx_cmask __arena *m, u32 cid)
+{
+ return cid >= m->base && cid < m->base + m->nr_bits;
+}
+
+static __always_inline u64 __arena *__cmask_word(const struct scx_cmask __arena *m, u32 cid)
+{
+ return (u64 __arena *)&m->bits[cid / 64 - m->base / 64];
+}
+
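+/* Initialize @m to cover [base, base + nr_bits) with all bits clear. */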
+static __always_inline void cmask_init(struct scx_cmask __arena *m, u32 base, u32 nr_bits)
+{
+ u32 nr_words = CMASK_NR_WORDS(nr_bits), i;
+
+ m->base = base;
+ m->nr_bits = nr_bits;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
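+/* Test @cid's bit. An out-of-range @cid reads as false. */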
+static __always_inline bool cmask_test(const struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return false;
+ return *__cmask_word(m, cid) & BIT_U64(cid & 63);
+}
+
+/*
+ * x86 BPF JIT rejects BPF_OR | BPF_FETCH and BPF_AND | BPF_FETCH on arena
+ * pointers (see bpf_jit_supports_insn() in arch/x86/net/bpf_jit_comp.c). Only
+ * BPF_CMPXCHG / BPF_XCHG / BPF_ADD with FETCH are allowed. Implement
+ * test_and_{set,clear} and the atomic set/clear via a cmpxchg loop.
+ *
+ * CMASK_CAS_TRIES is far above what any non-pathological contention needs.
+ * Exhausting it means the bit update was lost, which corrupts the caller's view
+ * of the bitmap, so raise scx_bpf_error() to abort the scheduler.
+ */
+#define CMASK_CAS_TRIES 1024
+
+static __always_inline void cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_set CAS exhausted at cid %u", cid);
+}
+
+static __always_inline void cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return;
+ }
+ scx_bpf_error("cmask_clear CAS exhausted at cid %u", cid);
+}
+
+static __always_inline bool cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (old & bit)
+ return true;
+ new = old | bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return false;
+ }
+ scx_bpf_error("cmask_test_and_set CAS exhausted at cid %u", cid);
+ return false;
+}
+
+static __always_inline bool cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 __arena *w;
+ u64 bit, old, new;
+ u32 i;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ bit = BIT_U64(cid & 63);
+ bpf_for(i, 0, CMASK_CAS_TRIES) {
+ old = *w;
+ if (!(old & bit))
+ return false;
+ new = old & ~bit;
+ if (__sync_val_compare_and_swap(w, old, new) == old)
+ return true;
+ }
+ scx_bpf_error("cmask_test_and_clear CAS exhausted at cid %u", cid);
+ return false;
+}
+
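+/*
+ * Non-atomic variants of the set/clear helpers above. Plain RMW on the
+ * word, so only safe while no concurrent updater can touch @m (e.g. while
+ * a mask is still being built).
+ */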
+static __always_inline void __cmask_set(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) |= BIT_U64(cid & 63);
+}
+
+static __always_inline void __cmask_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ if (!__cmask_contains(m, cid))
+ return;
+ *__cmask_word(m, cid) &= ~BIT_U64(cid & 63);
+}
+
+static __always_inline bool __cmask_test_and_set(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w |= bit;
+ return prev;
+}
+
+static __always_inline bool __cmask_test_and_clear(struct scx_cmask __arena *m, u32 cid)
+{
+ u64 bit = BIT_U64(cid & 63);
+ u64 __arena *w;
+ u64 prev;
+
+ if (!__cmask_contains(m, cid))
+ return false;
+ w = __cmask_word(m, cid);
+ prev = *w & bit;
+ *w &= ~bit;
+ return prev;
+}
+
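+/* Clear every word of @m; base and nr_bits are left as initialized. */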
+static __always_inline void cmask_zero(struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ m->bits[i] = 0;
+ }
+}
+
+/*
+ * BPF_-prefixed to avoid colliding with the kernel's anonymous CMASK_OP_*
+ * enum in ext_cid.c, which is exported via BTF and reachable through
+ * vmlinux.h.
+ */
+enum {
+ BPF_CMASK_OP_AND,
+ BPF_CMASK_OP_OR,
+ BPF_CMASK_OP_COPY,
+ BPF_CMASK_OP_ANDNOT,
+};
+
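+/*
+ * Apply @op to a single word pair: @di/@si index dst->bits[]/src->bits[]
+ * for the same global 64-cid window, and @mask selects which result bits
+ * land in @dst - masked-out bits keep their prior values.
+ */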
+static __always_inline void cmask_op_word(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src,
+ u32 di, u32 si, u64 mask, int op)
+{
+ u64 dv = dst->bits[di];
+ u64 sv = src->bits[si];
+ u64 rv;
+
+ if (op == BPF_CMASK_OP_AND)
+ rv = dv & sv;
+ else if (op == BPF_CMASK_OP_OR)
+ rv = dv | sv;
+ else if (op == BPF_CMASK_OP_ANDNOT)
+ rv = dv & ~sv;
+ else
+ rv = sv;
+
+ dst->bits[di] = (dv & ~mask) | (rv & mask);
+}
+
+static __always_inline void cmask_op(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src, int op)
+{
+ u32 d_end = dst->base + dst->nr_bits;
+ u32 s_end = src->base + src->nr_bits;
+ u32 lo = dst->base > src->base ? dst->base : src->base;
+ u32 hi = d_end < s_end ? d_end : s_end;
+ u32 d_base = dst->base / 64;
+ u32 s_base = src->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 m;
+
+ if (w > hi_word)
+ break;
+
+ m = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ m &= head_mask;
+ if (w == hi_word)
+ m &= tail_mask;
+
+ cmask_op_word(dst, src, w - d_base, w - s_base, m, op);
+ }
+}
+
+/*
+ * cmask_and/or/copy/andnot only modify @dst bits that lie in the
+ * intersection of [@dst->base, @dst->base + @dst->nr_bits) and
+ * [@src->base, @src->base + @src->nr_bits). Bits in @dst outside that
+ * window keep their prior values - in particular, cmask_copy() does NOT
+ * zero @dst bits that lie outside @src's range.
+ */
+static __always_inline void cmask_and(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_AND);
+}
+
+static __always_inline void cmask_or(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_OR);
+}
+
+static __always_inline void cmask_copy(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_COPY);
+}
+
+static __always_inline void cmask_andnot(struct scx_cmask __arena *dst,
+ const struct scx_cmask __arena *src)
+{
+ cmask_op(dst, src, BPF_CMASK_OP_ANDNOT);
+}
+
+/*
+ * True iff @a and @b have identical bits over their (assumed equal) range.
+ * Callers are expected to pass same-shape cmasks; differing shapes always
+ * compare unequal.
+ */
+static __always_inline bool cmask_equal(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 nr_words, i;
+
+ if (a->base != b->base || a->nr_bits != b->nr_bits)
+ return false;
+ nr_words = CMASK_NR_WORDS(a->nr_bits);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ if (a->bits[i] != b->bits[i])
+ return false;
+ }
+ return true;
+}
+
+/*
+ * True iff @a's range is contained in @b's and every bit set in @a is also
+ * set in @b. Containment is checked by range, not bit by bit: an @a whose
+ * range extends outside @b's is never a subset, even if its out-of-range
+ * bits are all clear.
+ */
+static __always_inline bool cmask_subset(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 nr_words, i;
+
+ /* @a's range must be contained in @b's; checked by range, not by bit */
+ if (a->base < b->base || a_end > b_end)
+ return false;
+
+ nr_words = CMASK_NR_WORDS(a->nr_bits);
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi_b;
+
+ if (i >= nr_words)
+ break;
+ wi_b = a_wbase + i - b_wbase;
+ if (a->bits[i] & ~b->bits[wi_b])
+ return false;
+ }
+ return true;
+}
+
+/**
+ * cmask_next_set - find the first set bit at or after @cid
+ * @m: cmask to search
+ * @cid: starting cid (clamped to @m->base if below)
+ *
+ * Returns the smallest set cid in [@cid, @m->base + @m->nr_bits), or
+ * @m->base + @m->nr_bits if none (the out-of-range sentinel matches the
+ * termination condition used by cmask_for_each()).
+ */
+static __always_inline u32 cmask_next_set(const struct scx_cmask __arena *m, u32 cid)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 base = m->base / 64;
+ u32 last_wi = (end - 1) / 64 - base;
+ u32 start_wi, start_bit, i;
+
+ if (cid < m->base)
+ cid = m->base;
+ if (cid >= end)
+ return end;
+
+ start_wi = cid / 64 - base;
+ start_bit = cid & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (wi > last_wi)
+ break;
+
+ word = m->bits[wi];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = (base + wi) * 64 + __builtin_ctzll(word);
+ if (found >= end)
+ return end;
+ return found;
+ }
+ return end;
+}
+
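+/*
+ * First set cid in @m, or m->base + m->nr_bits if @m is empty.
+ * cmask_for_each() below walks every set cid in ascending order.
+ */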
+static __always_inline u32 cmask_first_set(const struct scx_cmask __arena *m)
+{
+ return cmask_next_set(m, m->base);
+}
+
+#define cmask_for_each(cid, m) \
+ for ((cid) = cmask_first_set((m)); \
+ (cid) < (m)->base + (m)->nr_bits; \
+ (cid) = cmask_next_set((m), (cid) + 1))
+
+/*
+ * Population count over [base, base + nr_bits). Padding bits in the head/tail
+ * words are guaranteed zero by the mutating helpers, so a flat popcount over
+ * all words is correct.
+ */
+static __always_inline u32 cmask_weight(const struct scx_cmask __arena *m)
+{
+ u32 nr_words = CMASK_NR_WORDS(m->nr_bits), i;
+ u32 count = 0;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ if (i >= nr_words)
+ break;
+ count += __builtin_popcountll(m->bits[i]);
+ }
+ return count;
+}
+
+/*
+ * True if @a and @b share any set bit. Walk only the intersection of their
+ * ranges, matching the semantics of cmask_and().
+ */
+static __always_inline bool cmask_intersects(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 a_base = a->base / 64;
+ u32 b_base = b->base / 64;
+ u32 lo_word, hi_word, i;
+ u64 head_mask, tail_mask;
+
+ if (lo >= hi)
+ return false;
+
+ lo_word = lo / 64;
+ hi_word = (hi - 1) / 64;
+ head_mask = GENMASK_U64(63, lo & 63);
+ tail_mask = GENMASK_U64((hi - 1) & 63, 0);
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 w = lo_word + i;
+ u64 mask, av, bv;
+
+ if (w > hi_word)
+ break;
+
+ mask = GENMASK_U64(63, 0);
+ if (w == lo_word)
+ mask &= head_mask;
+ if (w == hi_word)
+ mask &= tail_mask;
+
+ av = a->bits[w - a_base] & mask;
+ bv = b->bits[w - b_base] & mask;
+ if (av & bv)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, bounded by the
+ * intersection of the two ranges. Return a->base + a->nr_bits if none found.
+ *
+ * Building block for cmask_next_and_set_wrap(). Callers that want a bounded
+ * scan without wrap call this directly.
+ */
+static __always_inline u32 cmask_next_and_set(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 b_end = b->base + b->nr_bits;
+ u32 a_wbase = a->base / 64;
+ u32 b_wbase = b->base / 64;
+ u32 lo = a->base > b->base ? a->base : b->base;
+ u32 hi = a_end < b_end ? a_end : b_end;
+ u32 last_wi, start_wi, start_bit, i;
+
+ if (lo >= hi)
+ return a_end;
+ if (start < lo)
+ start = lo;
+ if (start >= hi)
+ return a_end;
+
+ last_wi = (hi - 1) / 64;
+ start_wi = start / 64;
+ start_bit = start & 63;
+
+ bpf_for(i, 0, CMASK_MAX_WORDS) {
+ u32 abs_wi = start_wi + i;
+ u64 word;
+ u32 found;
+
+ if (abs_wi > last_wi)
+ break;
+
+ word = a->bits[abs_wi - a_wbase] & b->bits[abs_wi - b_wbase];
+ if (i == 0)
+ word &= GENMASK_U64(63, start_bit);
+ if (!word)
+ continue;
+
+ found = abs_wi * 64 + __builtin_ctzll(word);
+ if (found >= hi)
+ return a_end;
+ return found;
+ }
+ return a_end;
+}
+
+/*
+ * Find the next set cid in @m at or after @start, wrapping to @m->base if no
+ * set bit is found in [start, m->base + m->nr_bits). Return m->base +
+ * m->nr_bits if @m is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_set_wrap(const struct scx_cmask __arena *m,
+ u32 start)
+{
+ u32 end = m->base + m->nr_bits;
+ u32 found;
+
+ found = cmask_next_set(m, start);
+ if (found < end || start <= m->base)
+ return found;
+
+ found = cmask_next_set(m, m->base);
+ return found < start ? found : end;
+}
+
+/*
+ * Find the next cid set in both @a and @b at or after @start, wrapping to
+ * @a->base if none found in the forward half. Return a->base + a->nr_bits
+ * if the intersection is empty.
+ *
+ * Callers do round-robin distribution by passing (last_cid + 1) as @start.
+ */
+static __always_inline u32 cmask_next_and_set_wrap(const struct scx_cmask __arena *a,
+ const struct scx_cmask __arena *b,
+ u32 start)
+{
+ u32 a_end = a->base + a->nr_bits;
+ u32 found;
+
+ found = cmask_next_and_set(a, b, start);
+ if (found < a_end || start <= a->base)
+ return found;
+
+ found = cmask_next_and_set(a, b, a->base);
+ return found < start ? found : a_end;
+}
+
+/**
+ * cmask_from_cpumask - translate a kernel cpumask to a cid-space cmask
+ * @m: cmask to fill. Zeroed first; only bits within [@m->base, @m->base +
+ * @m->nr_bits) are updated - cpus mapping to cids outside that range
+ * are ignored.
+ * @cpumask: kernel cpumask to translate
+ *
+ * For each cpu in @cpumask, set the cpu's cid in @m. Caller must ensure
+ * @cpumask stays stable across the call (e.g. RCU read lock for
+ * task->cpus_ptr).
+ */
+static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
+ const struct cpumask *cpumask)
+{
+ u32 nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ s32 cpu;
+
+ cmask_zero(m);
+ bpf_for(cpu, 0, nr_cpu_ids) {
+ s32 cid;
+
+ if (!bpf_cpumask_test_cpu(cpu, cpumask))
+ continue;
+ cid = scx_bpf_cpu_to_cid(cpu);
+ if (cid >= 0)
+ __cmask_set(m, cid);
+ }
+}
+
+/**
+ * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
+ * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
+ * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
+ *
+ * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
+ * scx_bpf_error() on probe failure or precondition violation.
+ */
+static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
+ const struct scx_cmask *src)
+{
+ u32 base = 0, nr_bits = 0, nr_words, wi;
+
+ if (dst->base != 0) {
+ scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
+ return;
+ }
+
+ if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
+ scx_bpf_error("probe-read cmask->base failed");
+ return;
+ }
+ if (base != 0) {
+ scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
+ return;
+ }
+
+ if (bpf_probe_read_kernel(&nr_bits, sizeof(nr_bits), &src->nr_bits)) {
+ scx_bpf_error("probe-read cmask->nr_bits failed");
+ return;
+ }
+
+ if (nr_bits > dst->nr_bits) {
+ scx_bpf_error("src cmask nr_bits=%u exceeds dst nr_bits=%u",
+ nr_bits, dst->nr_bits);
+ return;
+ }
+
+ nr_words = CMASK_NR_WORDS(nr_bits);
+ cmask_zero(dst);
+ bpf_for(wi, 0, CMASK_MAX_WORDS) {
+ u64 word = 0;
+ if (wi >= nr_words)
+ break;
+ if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
+ scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
+ return;
+ }
+ dst->bits[wi] = word;
+ }
+}
+
+#endif /* __SCX_CID_BPF_H */
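
For illustration (not part of the patch), a minimal BPF-side sketch of
round-robin idle-cid selection; idle_cids, allowed and last_cid are
hypothetical scheduler state:

	u32 end = idle_cids->base + idle_cids->nr_bits;
	u32 cid;

	/* resume after the previously claimed cid, wrapping at the end */
	cid = cmask_next_and_set_wrap(idle_cids, allowed, last_cid + 1);
	if (cid < end && cmask_test_and_clear(idle_cids, cid))
		last_cid = cid;	/* claimed cid; otherwise none idle or raced */
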
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 087ae4f79c60..ff57a7acdbeb 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -1055,5 +1055,6 @@ static inline u64 scx_clock_irq(u32 cpu)
#include "compat.bpf.h"
#include "enums.bpf.h"
+#include "cid.bpf.h"
#endif /* __SCX_COMMON_BPF_H */
--
2.54.0