Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: "Emil Tsalapatis" <emil@etsalapatis.com>
To: "Tejun Heo" <tj@kernel.org>, "David Vernet" <void@manifault.com>,
	"Andrea Righi" <arighi@nvidia.com>,
	"Changwoo Min" <changwoo@igalia.com>,
	"Alexei Starovoitov" <ast@kernel.org>,
	"Andrii Nakryiko" <andrii@kernel.org>,
	"Daniel Borkmann" <daniel@iogearbox.net>,
	"Martin KaFai Lau" <martin.lau@linux.dev>,
	"Kumar Kartikeya Dwivedi" <memxor@gmail.com>
Cc: "Peter Zijlstra" <peterz@infradead.org>,
	"Catalin Marinas" <catalin.marinas@arm.com>,
	"Will Deacon" <will@kernel.org>,
	"Thomas Gleixner" <tglx@kernel.org>,
	"Ingo Molnar" <mingo@redhat.com>,
	"Borislav Petkov" <bp@alien8.de>,
	"Dave Hansen" <dave.hansen@linux.intel.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"David Hildenbrand" <david@kernel.org>,
	"Mike Rapoport" <rppt@kernel.org>,
	"Emil Tsalapatis" <emil@etsalapatis.com>,
	<sched-ext@lists.linux.dev>, <bpf@vger.kernel.org>,
	<x86@kernel.org>, <linux-arm-kernel@lists.infradead.org>,
	<linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
Date: Thu, 21 May 2026 00:19:30 -0400	[thread overview]
Message-ID: <DIO2DVGYRBUN.1QPC47NPPWTZ9@etsalapatis.com> (raw)
In-Reply-To: <20260520235052.4180316-9-tj@kernel.org>

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
> arena, so it translated cpumask -> cmask in kernel memory and passed the
> result as a trusted pointer. The BPF cmask helpers all operate on arena
> cmasks though, so the BPF side had to word-by-word probe-read the kernel
> cmask into an arena cmask via cmask_copy_from_kernel() before any helper
> could touch it. It works, but is clumsy.
>
> With direct kernel-side arena access now in place, build the cmask in the
> arena. The kernel writes to it through the kern_va side of the dual mapping;
> BPF directly dereferences it via an __arena pointer like any other arena
> struct.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
>  kernel/sched/ext_cid.c                | 20 +-------
>  kernel/sched/ext_internal.h           | 10 +++-
>  tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
>  tools/sched_ext/scx_qmap.bpf.c        |  5 +-
>  5 files changed, 75 insertions(+), 80 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index fb91079c1244..94562e3350c6 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
>  		update_locked_rq(rq);
>  
>  	if (scx_is_cid_type()) {
> -		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
> -
> -		lockdep_assert_irqs_disabled();
> -		scx_cpumask_to_cmask(cpumask, cmask);
> -		sch->ops_cid.set_cmask(task, cmask);
> +		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
> +		unsigned long uaddr = (unsigned long)kern_va -
> +			bpf_arena_map_kern_vm_start(sch->arena_map);
> +		/*
> +		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
> +		 * holds the rq lock with IRQs disabled, which makes us the sole
> +		 * user of the scratch area.
> +		 */
> +		scx_cpumask_to_cmask(cpumask, kern_va);
> +		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
>  	} else {
>  		sch->ops.set_cpumask(task, cpumask);
>  	}
> @@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
>  static void free_pnode(struct scx_sched_pnode *pnode);
>  static void free_exit_info(struct scx_exit_info *ei);
>  
> +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
> +{
> +	size_t size = struct_size_t(struct scx_cmask, bits,
> +				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
> +	int cpu;
> +
> +	if (!sch->is_cid_type || !sch->arena_pool)
> +		return 0;
> +
> +	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
> +	if (!sch->set_cmask_scratch)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> +		*slot = scx_arena_alloc(sch, size);
> +		if (!*slot)
> +			return -ENOMEM;
> +		scx_cmask_init(*slot, 0, num_possible_cpus());
> +	}
> +	return 0;
> +}
> +
> +static void scx_set_cmask_scratch_free(struct scx_sched *sch)
> +{
> +	size_t size = struct_size_t(struct scx_cmask, bits,
> +				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
> +	int cpu;
> +
> +	if (!sch->set_cmask_scratch)
> +		return;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> +		scx_arena_free(sch, *slot, size);
> +	}
> +	free_percpu(sch->set_cmask_scratch);
> +	sch->set_cmask_scratch = NULL;
> +}
> +
>  static void scx_sched_free_rcu_work(struct work_struct *work)
>  {
>  	struct rcu_work *rcu_work = to_rcu_work(work);
> @@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>  
>  	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
>  	free_exit_info(sch->exit_info);
> +	scx_set_cmask_scratch_free(sch);
>  	scx_arena_pool_destroy(sch);
>  	if (sch->arena_map)
>  		bpf_map_put(sch->arena_map);
> @@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
>  		goto err_disable;
>  	}
>  
> +	ret = scx_set_cmask_scratch_alloc(sch);
> +	if (ret) {
> +		cpus_read_unlock();
> +		goto err_disable;
> +	}
> +
>  	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
>  		if (((void (**)(void))ops)[i])
>  			set_bit(i, sch->has_op);
> @@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
>  	if (ret)
>  		goto err_disable;
>  
> +	ret = scx_set_cmask_scratch_alloc(sch);
> +	if (ret)
> +		goto err_disable;
> +
>  	if (validate_ops(sch, ops))
>  		goto err_disable;
>  
> diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
> index 0c91b951fd33..808c6390da5a 100644
> --- a/kernel/sched/ext_cid.c
> +++ b/kernel/sched/ext_cid.c
> @@ -7,14 +7,6 @@
>   */
>  #include <linux/cacheinfo.h>
>  
> -/*
> - * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
> - * cmask from a cpumask. Allocated alongside the cid arrays on first enable
> - * and never freed. Sized to the full cid space. Caller holds rq lock so
> - * this_cpu_ptr is safe.
> - */
> -struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
>  /*
>   * cid tables.
>   *
> @@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
>  	u32 npossible = num_possible_cpus();
>  	s16 *cid_to_cpu, *cpu_to_cid;
>  	struct scx_cid_topo *cid_topo;
> -	struct scx_cmask __percpu *set_cmask_scratch;
> -	s32 cpu;
>  
>  	if (scx_cid_to_cpu_tbl)
>  		return 0;
> @@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
>  	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
>  	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
>  	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
> -	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
> -						       SCX_CMASK_NR_WORDS(npossible)),
> -					   sizeof(u64));
>  
> -	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
> +	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
>  		kfree(cid_to_cpu);
>  		kfree(cpu_to_cid);
>  		kfree(cid_topo);
> -		free_percpu(set_cmask_scratch);
>  		return -ENOMEM;
>  	}
>  
>  	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
>  	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
>  	WRITE_ONCE(scx_cid_topo, cid_topo);
> -	for_each_possible_cpu(cpu)
> -		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
> -			       0, npossible);
> -	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
>  	return 0;
>  }
>  
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index ff7e882bd67a..9bb65367f510 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -1124,6 +1124,14 @@ struct scx_sched {
>  	struct bpf_map		*arena_map;
>  	struct gen_pool		*arena_pool;
>  
> +	/*
> +	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
> +	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
> +	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
> +	 * arena's kern_vm_start.
> +	 */
> +	struct scx_cmask * __percpu *set_cmask_scratch;
> +
>  	DECLARE_BITMAP(has_op, SCX_OPI_END);
>  
>  	/*
> @@ -1480,8 +1488,6 @@ enum scx_ops_state {
>  extern struct scx_sched __rcu *scx_root;
>  DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
>  
> -extern struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
>  /*
>   * True when the currently loaded scheduler hierarchy is cid-form. All scheds
>   * in a hierarchy share one form, so this single key tells callsites which
> diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
> index e281c88fa824..70f2a3829af4 100644
> --- a/tools/sched_ext/include/scx/cid.bpf.h
> +++ b/tools/sched_ext/include/scx/cid.bpf.h
> @@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
>  	}
>  }
>  
> -/**
> - * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
> - * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
> - * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
> - *
> - * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
> - * scx_bpf_error() on probe failure or precondition violation.
> - */
> -static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
> -						   const struct scx_cmask *src)
> -{
> -	u32 base = 0, nr_cids = 0, nr_words, wi;
> -
> -	if (dst->base != 0) {
> -		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
> -		return;
> -	}
> -
> -	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
> -		scx_bpf_error("probe-read cmask->base failed");
> -		return;
> -	}
> -	if (base != 0) {
> -		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
> -		return;
> -	}
> -
> -	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
> -		scx_bpf_error("probe-read cmask->nr_cids failed");
> -		return;
> -	}
> -
> -	if (nr_cids > dst->nr_cids) {
> -		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
> -			      nr_cids, dst->nr_cids);
> -		return;
> -	}
> -
> -	nr_words = CMASK_NR_WORDS(nr_cids);
> -	cmask_zero(dst);
> -	bpf_for(wi, 0, CMASK_MAX_WORDS) {
> -		u64 word = 0;
> -		if (wi >= nr_words)
> -			break;
> -		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
> -			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
> -			return;
> -		}
> -		dst->bits[wi] = word;
> -	}
> -}
> -
>  #endif /* __SCX_CID_BPF_H */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 7e77f22674ea..8a2d6a8ebd8e 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
>  }
>  
>  void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
> -		    const struct scx_cmask *cmask)
> +		    const struct scx_cmask *cmask_in)
>  {
> +	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
>  	task_ctx_t *taskc;
>  
>  	taskc = lookup_task_ctx(p);
>  	if (!taskc)
>  		return;
> -	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
> +	cmask_copy(&taskc->cpus_allowed, cmask);
>  }
>  
>  struct monitor_timer {



  reply	other threads:[~2026-05-21  4:19 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
2026-05-21  7:00   ` Andrea Righi
2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
2026-05-21  3:16   ` Emil Tsalapatis
2026-05-21  9:42   ` Alexei Starovoitov
2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-05-21  3:17   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-05-21  4:07   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
2026-05-21  4:08   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
2026-05-21  4:15   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-05-21  7:56   ` Andrea Righi
2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
2026-05-21  4:19   ` Emil Tsalapatis [this message]
  -- strict thread matches above, loose matches on Subject: below --
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=DIO2DVGYRBUN.1QPC47NPPWTZ9@etsalapatis.com \
    --to=emil@etsalapatis.com \
    --cc=akpm@linux-foundation.org \
    --cc=andrii@kernel.org \
    --cc=arighi@nvidia.com \
    --cc=ast@kernel.org \
    --cc=bp@alien8.de \
    --cc=bpf@vger.kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=changwoo@igalia.com \
    --cc=daniel@iogearbox.net \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@kernel.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=martin.lau@linux.dev \
    --cc=memxor@gmail.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rppt@kernel.org \
    --cc=sched-ext@lists.linux.dev \
    --cc=tglx@kernel.org \
    --cc=tj@kernel.org \
    --cc=void@manifault.com \
    --cc=will@kernel.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.