From: "Emil Tsalapatis" <emil@etsalapatis.com>
To: "Tejun Heo" <tj@kernel.org>, "David Vernet" <void@manifault.com>,
"Andrea Righi" <arighi@nvidia.com>,
"Changwoo Min" <changwoo@igalia.com>,
"Alexei Starovoitov" <ast@kernel.org>,
"Andrii Nakryiko" <andrii@kernel.org>,
"Daniel Borkmann" <daniel@iogearbox.net>,
"Martin KaFai Lau" <martin.lau@linux.dev>,
"Kumar Kartikeya Dwivedi" <memxor@gmail.com>
Cc: "Peter Zijlstra" <peterz@infradead.org>,
"Catalin Marinas" <catalin.marinas@arm.com>,
"Will Deacon" <will@kernel.org>,
"Thomas Gleixner" <tglx@kernel.org>,
"Ingo Molnar" <mingo@redhat.com>,
"Borislav Petkov" <bp@alien8.de>,
"Dave Hansen" <dave.hansen@linux.intel.com>,
"Andrew Morton" <akpm@linux-foundation.org>,
"David Hildenbrand" <david@kernel.org>,
"Mike Rapoport" <rppt@kernel.org>,
"Emil Tsalapatis" <emil@etsalapatis.com>,
<sched-ext@lists.linux.dev>, <bpf@vger.kernel.org>,
<x86@kernel.org>, <linux-arm-kernel@lists.infradead.org>,
<linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
Date: Thu, 21 May 2026 00:19:30 -0400 [thread overview]
Message-ID: <DIO2DVGYRBUN.1QPC47NPPWTZ9@etsalapatis.com> (raw)
In-Reply-To: <20260520235052.4180316-9-tj@kernel.org>
On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
> arena, so it translated cpumask -> cmask in kernel memory and passed the
> result as a trusted pointer. The BPF cmask helpers all operate on arena
> cmasks though, so the BPF side had to word-by-word probe-read the kernel
> cmask into an arena cmask via cmask_copy_from_kernel() before any helper
> could touch it. It works, but is clumsy.
>
> With direct kernel-side arena access now in place, build the cmask in the
> arena. The kernel writes to it through the kern_va side of the dual mapping;
> BPF directly dereferences it via an __arena pointer like any other arena
> struct.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> ---
> kernel/sched/ext.c | 68 +++++++++++++++++++++++++--
> kernel/sched/ext_cid.c | 20 +-------
> kernel/sched/ext_internal.h | 10 +++-
> tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
> tools/sched_ext/scx_qmap.bpf.c | 5 +-
> 5 files changed, 75 insertions(+), 80 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index fb91079c1244..94562e3350c6 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
> update_locked_rq(rq);
>
> if (scx_is_cid_type()) {
> - struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
> -
> - lockdep_assert_irqs_disabled();
> - scx_cpumask_to_cmask(cpumask, cmask);
> - sch->ops_cid.set_cmask(task, cmask);
> + struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
> + unsigned long uaddr = (unsigned long)kern_va -
> + bpf_arena_map_kern_vm_start(sch->arena_map);
> + /*
> + * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
> + * holds the rq lock with IRQs disabled, which makes us the sole
> + * user of the scratch area.
> + */
> + scx_cpumask_to_cmask(cpumask, kern_va);
> + sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
> } else {
> sch->ops.set_cpumask(task, cpumask);
> }
> @@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
> static void free_pnode(struct scx_sched_pnode *pnode);
> static void free_exit_info(struct scx_exit_info *ei);
>
> +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
> +{
> + size_t size = struct_size_t(struct scx_cmask, bits,
> + SCX_CMASK_NR_WORDS(num_possible_cpus()));
> + int cpu;
> +
> + if (!sch->is_cid_type || !sch->arena_pool)
> + return 0;
> +
> + sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
> + if (!sch->set_cmask_scratch)
> + return -ENOMEM;
> +
> + for_each_possible_cpu(cpu) {
> + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> + *slot = scx_arena_alloc(sch, size);
> + if (!*slot)
> + return -ENOMEM;
> + scx_cmask_init(*slot, 0, num_possible_cpus());
> + }
> + return 0;
> +}
> +
> +static void scx_set_cmask_scratch_free(struct scx_sched *sch)
> +{
> + size_t size = struct_size_t(struct scx_cmask, bits,
> + SCX_CMASK_NR_WORDS(num_possible_cpus()));
> + int cpu;
> +
> + if (!sch->set_cmask_scratch)
> + return;
> +
> + for_each_possible_cpu(cpu) {
> + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> + scx_arena_free(sch, *slot, size);
> + }
> + free_percpu(sch->set_cmask_scratch);
> + sch->set_cmask_scratch = NULL;
> +}
> +
> static void scx_sched_free_rcu_work(struct work_struct *work)
> {
> struct rcu_work *rcu_work = to_rcu_work(work);
> @@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>
> rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
> free_exit_info(sch->exit_info);
> + scx_set_cmask_scratch_free(sch);
> scx_arena_pool_destroy(sch);
> if (sch->arena_map)
> bpf_map_put(sch->arena_map);
> @@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
> goto err_disable;
> }
>
> + ret = scx_set_cmask_scratch_alloc(sch);
> + if (ret) {
> + cpus_read_unlock();
> + goto err_disable;
> + }
> +
> for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
> if (((void (**)(void))ops)[i])
> set_bit(i, sch->has_op);
> @@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
> if (ret)
> goto err_disable;
>
> + ret = scx_set_cmask_scratch_alloc(sch);
> + if (ret)
> + goto err_disable;
> +
> if (validate_ops(sch, ops))
> goto err_disable;
>
> diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
> index 0c91b951fd33..808c6390da5a 100644
> --- a/kernel/sched/ext_cid.c
> +++ b/kernel/sched/ext_cid.c
> @@ -7,14 +7,6 @@
> */
> #include <linux/cacheinfo.h>
>
> -/*
> - * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
> - * cmask from a cpumask. Allocated alongside the cid arrays on first enable
> - * and never freed. Sized to the full cid space. Caller holds rq lock so
> - * this_cpu_ptr is safe.
> - */
> -struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
> /*
> * cid tables.
> *
> @@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
> u32 npossible = num_possible_cpus();
> s16 *cid_to_cpu, *cpu_to_cid;
> struct scx_cid_topo *cid_topo;
> - struct scx_cmask __percpu *set_cmask_scratch;
> - s32 cpu;
>
> if (scx_cid_to_cpu_tbl)
> return 0;
> @@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
> cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
> cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
> cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
> - set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
> - SCX_CMASK_NR_WORDS(npossible)),
> - sizeof(u64));
>
> - if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
> + if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
> kfree(cid_to_cpu);
> kfree(cpu_to_cid);
> kfree(cid_topo);
> - free_percpu(set_cmask_scratch);
> return -ENOMEM;
> }
>
> WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
> WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
> WRITE_ONCE(scx_cid_topo, cid_topo);
> - for_each_possible_cpu(cpu)
> - scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
> - 0, npossible);
> - WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
> return 0;
> }
>
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index ff7e882bd67a..9bb65367f510 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -1124,6 +1124,14 @@ struct scx_sched {
> struct bpf_map *arena_map;
> struct gen_pool *arena_pool;
>
> + /*
> + * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
> + * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
> + * the BPF-arena uaddr handed to BPF is recovered by subtracting the
> + * arena's kern_vm_start.
> + */
> + struct scx_cmask * __percpu *set_cmask_scratch;
> +
> DECLARE_BITMAP(has_op, SCX_OPI_END);
>
> /*
> @@ -1480,8 +1488,6 @@ enum scx_ops_state {
> extern struct scx_sched __rcu *scx_root;
> DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
>
> -extern struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
> /*
> * True when the currently loaded scheduler hierarchy is cid-form. All scheds
> * in a hierarchy share one form, so this single key tells callsites which
> diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
> index e281c88fa824..70f2a3829af4 100644
> --- a/tools/sched_ext/include/scx/cid.bpf.h
> +++ b/tools/sched_ext/include/scx/cid.bpf.h
> @@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
> }
> }
>
> -/**
> - * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
> - * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
> - * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
> - *
> - * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
> - * scx_bpf_error() on probe failure or precondition violation.
> - */
> -static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
> - const struct scx_cmask *src)
> -{
> - u32 base = 0, nr_cids = 0, nr_words, wi;
> -
> - if (dst->base != 0) {
> - scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
> - return;
> - }
> -
> - if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
> - scx_bpf_error("probe-read cmask->base failed");
> - return;
> - }
> - if (base != 0) {
> - scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
> - return;
> - }
> -
> - if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
> - scx_bpf_error("probe-read cmask->nr_cids failed");
> - return;
> - }
> -
> - if (nr_cids > dst->nr_cids) {
> - scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
> - nr_cids, dst->nr_cids);
> - return;
> - }
> -
> - nr_words = CMASK_NR_WORDS(nr_cids);
> - cmask_zero(dst);
> - bpf_for(wi, 0, CMASK_MAX_WORDS) {
> - u64 word = 0;
> - if (wi >= nr_words)
> - break;
> - if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
> - scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
> - return;
> - }
> - dst->bits[wi] = word;
> - }
> -}
> -
> #endif /* __SCX_CID_BPF_H */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 7e77f22674ea..8a2d6a8ebd8e 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
> }
>
> void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
> - const struct scx_cmask *cmask)
> + const struct scx_cmask *cmask_in)
> {
> + struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
> task_ctx_t *taskc;
>
> taskc = lookup_task_ctx(p);
> if (!taskc)
> return;
> - cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
> + cmask_copy(&taskc->cpus_allowed, cmask);
> }
>
> struct monitor_timer {
next prev parent reply other threads:[~2026-05-21 4:19 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
2026-05-21 7:00 ` Andrea Righi
2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
2026-05-21 3:16 ` Emil Tsalapatis
2026-05-21 9:42 ` Alexei Starovoitov
2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-05-21 3:17 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-05-21 4:07 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
2026-05-21 4:08 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
2026-05-21 4:15 ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-05-21 7:56 ` Andrea Righi
2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
2026-05-21 4:19 ` Emil Tsalapatis [this message]
-- strict thread matches above, loose matches on Subject: below --
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=DIO2DVGYRBUN.1QPC47NPPWTZ9@etsalapatis.com \
--to=emil@etsalapatis.com \
--cc=akpm@linux-foundation.org \
--cc=andrii@kernel.org \
--cc=arighi@nvidia.com \
--cc=ast@kernel.org \
--cc=bp@alien8.de \
--cc=bpf@vger.kernel.org \
--cc=catalin.marinas@arm.com \
--cc=changwoo@igalia.com \
--cc=daniel@iogearbox.net \
--cc=dave.hansen@linux.intel.com \
--cc=david@kernel.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=martin.lau@linux.dev \
--cc=memxor@gmail.com \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=rppt@kernel.org \
--cc=sched-ext@lists.linux.dev \
--cc=tglx@kernel.org \
--cc=tj@kernel.org \
--cc=void@manifault.com \
--cc=will@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.