From: Tejun Heo <tj@kernel.org>
To: David Vernet <void@manifault.com>,
Andrea Righi <arighi@nvidia.com>,
Changwoo Min <changwoo@igalia.com>,
Alexei Starovoitov <ast@kernel.org>,
Andrii Nakryiko <andrii@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
Martin KaFai Lau <martin.lau@linux.dev>,
Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>,
Catalin Marinas <catalin.marinas@arm.com>,
Will Deacon <will@kernel.org>, Thomas Gleixner <tglx@kernel.org>,
Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
Dave Hansen <dave.hansen@linux.intel.com>,
Andrew Morton <akpm@linux-foundation.org>,
David Hildenbrand <david@kernel.org>,
Mike Rapoport <rppt@kernel.org>,
Emil Tsalapatis <emil@etsalapatis.com>,
sched-ext@lists.linux.dev, bpf@vger.kernel.org, x86@kernel.org,
linux-arm-kernel@lists.infradead.org, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
Date: Wed, 20 May 2026 13:50:52 -1000 [thread overview]
Message-ID: <20260520235052.4180316-9-tj@kernel.org> (raw)
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>
ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.
With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 68 +++++++++++++++++++++++++--
kernel/sched/ext_cid.c | 20 +-------
kernel/sched/ext_internal.h | 10 +++-
tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
tools/sched_ext/scx_qmap.bpf.c | 5 +-
5 files changed, 75 insertions(+), 80 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fb91079c1244..94562e3350c6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
update_locked_rq(rq);
if (scx_is_cid_type()) {
- struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
- lockdep_assert_irqs_disabled();
- scx_cpumask_to_cmask(cpumask, cmask);
- sch->ops_cid.set_cmask(task, cmask);
+ struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+ unsigned long uaddr = (unsigned long)kern_va -
+ bpf_arena_map_kern_vm_start(sch->arena_map);
+ /*
+ * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+ * holds the rq lock with IRQs disabled, which makes us the sole
+ * user of the scratch area.
+ */
+ scx_cpumask_to_cmask(cpumask, kern_va);
+ sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
} else {
sch->ops.set_cpumask(task, cpumask);
}
@@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->is_cid_type || !sch->arena_pool)
+ return 0;
+
+ sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+ if (!sch->set_cmask_scratch)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ *slot = scx_arena_alloc(sch, size);
+ if (!*slot)
+ return -ENOMEM;
+ scx_cmask_init(*slot, 0, num_possible_cpus());
+ }
+ return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus()));
+ int cpu;
+
+ if (!sch->set_cmask_scratch)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ scx_arena_free(sch, *slot, size);
+ }
+ free_percpu(sch->set_cmask_scratch);
+ sch->set_cmask_scratch = NULL;
+}
+
static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_set_cmask_scratch_free(sch);
scx_arena_pool_destroy(sch);
if (sch->arena_map)
bpf_map_put(sch->arena_map);
@@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_disable;
}
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
if (ret)
goto err_disable;
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 0c91b951fd33..808c6390da5a 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
*/
#include <linux/cacheinfo.h>
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* cid tables.
*
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
u32 npossible = num_possible_cpus();
s16 *cid_to_cpu, *cpu_to_cid;
struct scx_cid_topo *cid_topo;
- struct scx_cmask __percpu *set_cmask_scratch;
- s32 cpu;
if (scx_cid_to_cpu_tbl)
return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
- set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
- SCX_CMASK_NR_WORDS(npossible)),
- sizeof(u64));
- if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+ if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
kfree(cid_to_cpu);
kfree(cpu_to_cid);
kfree(cid_topo);
- free_percpu(set_cmask_scratch);
return -ENOMEM;
}
WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
WRITE_ONCE(scx_cid_topo, cid_topo);
- for_each_possible_cpu(cpu)
- scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
- 0, npossible);
- WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
return 0;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
struct bpf_map *arena_map;
struct gen_pool *arena_pool;
+ /*
+ * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+ * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+ * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+ * arena's kern_vm_start.
+ */
+ struct scx_cmask * __percpu *set_cmask_scratch;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* True when the currently loaded scheduler hierarchy is cid-form. All scheds
* in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index e281c88fa824..70f2a3829af4 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
}
}
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
- const struct scx_cmask *src)
-{
- u32 base = 0, nr_cids = 0, nr_words, wi;
-
- if (dst->base != 0) {
- scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
- return;
- }
-
- if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
- scx_bpf_error("probe-read cmask->base failed");
- return;
- }
- if (base != 0) {
- scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
- return;
- }
-
- if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
- scx_bpf_error("probe-read cmask->nr_cids failed");
- return;
- }
-
- if (nr_cids > dst->nr_cids) {
- scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
- nr_cids, dst->nr_cids);
- return;
- }
-
- nr_words = CMASK_NR_WORDS(nr_cids);
- cmask_zero(dst);
- bpf_for(wi, 0, CMASK_MAX_WORDS) {
- u64 word = 0;
- if (wi >= nr_words)
- break;
- if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
- scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
- return;
- }
- dst->bits[wi] = word;
- }
-}
-
#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
}
void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
- const struct scx_cmask *cmask)
+ const struct scx_cmask *cmask_in)
{
+ struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
task_ctx_t *taskc;
taskc = lookup_task_ctx(p);
if (!taskc)
return;
- cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+ cmask_copy(&taskc->cpus_allowed, cmask);
}
struct monitor_timer {
--
2.54.0
next prev parent reply other threads:[~2026-05-20 23:51 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
2026-05-21 7:00 ` Andrea Righi
2026-05-21 17:37 ` [PATCH v3 " Tejun Heo
2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
2026-05-21 3:16 ` Emil Tsalapatis
2026-05-21 9:42 ` Alexei Starovoitov
2026-05-21 17:39 ` Tejun Heo
2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-05-21 3:17 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-05-21 4:07 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
2026-05-21 4:08 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
2026-05-21 4:15 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-05-21 7:56 ` Andrea Righi
2026-05-21 17:22 ` Tejun Heo
2026-05-21 17:37 ` [PATCH v2 " Tejun Heo
2026-05-21 17:54 ` Andrea Righi
2026-05-20 23:50 ` Tejun Heo [this message]
2026-05-21 4:19 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Emil Tsalapatis
2026-05-22 1:59 ` [PATCH v3 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
-- strict thread matches above, loose matches on Subject: below --
2026-05-22 17:22 [PATCHSET v4 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-22 17:22 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260520235052.4180316-9-tj@kernel.org \
--to=tj@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=arighi@nvidia.com \
--cc=ast@kernel.org \
--cc=bp@alien8.de \
--cc=bpf@vger.kernel.org \
--cc=catalin.marinas@arm.com \
--cc=changwoo@igalia.com \
--cc=daniel@iogearbox.net \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=emil@etsalapatis.com \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=martin.lau@linux.dev \
--cc=memxor@gmail.com \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=rppt@kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=tglx@kernel.org \
--cc=void@manifault.com \
--cc=will@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.