[PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask

Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
  2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
  0 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
	Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
	linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.

With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
 kernel/sched/ext_cid.c                | 20 +-------
 kernel/sched/ext_internal.h           | 10 +++-
 tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
 tools/sched_ext/scx_qmap.bpf.c        |  5 +-
 5 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3025fbe198d3..1369dc7e4b4e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
 		update_locked_rq(rq);
 
 	if (scx_is_cid_type()) {
-		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
-		lockdep_assert_irqs_disabled();
-		scx_cpumask_to_cmask(cpumask, cmask);
-		sch->ops_cid.set_cmask(task, cmask);
+		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+		unsigned long uaddr = (unsigned long)kern_va -
+			bpf_arena_map_kern_vm_start(sch->arena_map);
+		/*
+		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+		 * holds the rq lock with IRQs disabled, which makes us the sole
+		 * user of the scratch area.
+		 */
+		scx_cpumask_to_cmask(cpumask, kern_va);
+		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
 	} else {
 		sch->ops.set_cpumask(task, cpumask);
 	}
@@ -4957,6 +4962,48 @@ static const struct attribute_group scx_global_attr_group = {
 static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->is_cid_type || !sch->arena_pool)
+		return 0;
+
+	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+	if (!sch->set_cmask_scratch)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		*slot = scx_arena_alloc(sch, size);
+		if (!*slot)
+			return -ENOMEM;
+		scx_cmask_init(*slot, 0, num_possible_cpus());
+	}
+	return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->set_cmask_scratch)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		scx_arena_free(sch, *slot, size);
+	}
+	free_percpu(sch->set_cmask_scratch);
+	sch->set_cmask_scratch = NULL;
+}
+
 static void scx_sched_free_rcu_work(struct work_struct *work)
 {
 	struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5009,6 +5056,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_set_cmask_scratch_free(sch);
 	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
@@ -7147,6 +7195,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		goto err_disable;
 	}
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7469,6 +7523,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 5cd14143f88f..245a39e2e5eb 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
  */
 #include <linux/cacheinfo.h>
 
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * cid tables.
  *
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
 	u32 npossible = num_possible_cpus();
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
-	struct scx_cmask __percpu *set_cmask_scratch;
-	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
 	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
 	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
 	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
-	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
-						       SCX_CMASK_NR_WORDS(npossible)),
-					   sizeof(u64));
 
-	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
 		kfree(cid_to_cpu);
 		kfree(cpu_to_cid);
 		kfree(cid_topo);
-		free_percpu(set_cmask_scratch);
 		return -ENOMEM;
 	}
 
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
-	for_each_possible_cpu(cpu)
-		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
-			       0, npossible);
-	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
 	struct bpf_map		*arena_map;
 	struct gen_pool		*arena_pool;
 
+	/*
+	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+	 * arena's kern_vm_start.
+	 */
+	struct scx_cmask * __percpu *set_cmask_scratch;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * True when the currently loaded scheduler hierarchy is cid-form. All scheds
  * in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 257d8bdca966..875003f04bdc 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -669,56 +669,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
 	}
 }
 
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
-						   const struct scx_cmask *src)
-{
-	u32 base = 0, nr_cids = 0, nr_words, wi;
-
-	if (dst->base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
-		scx_bpf_error("probe-read cmask->base failed");
-		return;
-	}
-	if (base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
-		scx_bpf_error("probe-read cmask->nr_cids failed");
-		return;
-	}
-
-	if (nr_cids > dst->nr_cids) {
-		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
-			      nr_cids, dst->nr_cids);
-		return;
-	}
-
-	nr_words = CMASK_NR_WORDS(nr_cids);
-	cmask_zero(dst);
-	bpf_for(wi, 0, CMASK_MAX_WORDS) {
-		u64 word = 0;
-		if (wi >= nr_words)
-			break;
-		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
-			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
-			return;
-		}
-		dst->bits[wi] = word;
-	}
-}
-
 #endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
 }
 
 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
-		    const struct scx_cmask *cmask)
+		    const struct scx_cmask *cmask_in)
 {
+	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
 	task_ctx_t *taskc;
 
 	taskc = lookup_task_ctx(p);
 	if (!taskc)
 		return;
-	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+	cmask_copy(&taskc->cpus_allowed, cmask);
 }
 
 struct monitor_timer {
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access
@ 2026-05-20 23:50 Tejun Heo
  2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
                   ` (7 more replies)
  0 siblings, 8 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Hello,

This makes BPF arena memory directly dereferenceable from kernel code
(struct_ops callbacks, kfuncs). Each arena gets a per-arena scratch page
that an arch fault hook installs into empty PTEs on kernel-side faults,
after KFENCE. The faulting instruction retries and the violation is reported
through the program's BPF stream.

v3:
- Patch 1: rename ptep_try_install() to ptep_try_set(). Tighten kerneldoc
  for kernel-PTE use. (David Hildenbrand, Alexei)
- Patch 2: apply_range_clear_cb() uses ptep_get_and_clear() so the install
  and clear sides race through atomic accessors. (David)

v2: https://lore.kernel.org/r/20260517211232.1670594-1-tj@kernel.org
v1 (RFC): https://lore.kernel.org/r/20260427105109.2554518-1-tj@kernel.org

Motivation
----------

sched_ext's ops_cid.set_cmask() hands the BPF scheduler a struct scx_cmask
*. The kernel translates a kernel cpumask to a cmask, but it had no way to
write into the arena, so the cmask lived in kernel memory and was passed as
a trusted pointer. BPF cmask helpers all operate on arena cmasks though, so
the BPF side had to word-by-word probe-read the kernel cmask into an arena
cmask via cmask_copy_from_kernel() before any helper could touch it. It
works, but is clumsy.

The shape isn't unique to set_cmask. Sub-scheduler support is on the way and
more sched_ext callbacks will want to pass structured data to BPF. Anywhere
a kfunc or struct_ops callback wants to hand a struct to a BPF program,
arena residence is the natural answer.

Approach
--------

Each arena gets a per-arena scratch page. Arenas stay sparsely mapped as
today - PTEs are populated only for allocated pages. A new arch fault hook
(bpf_arena_handle_page_fault) is wired into x86 page_fault_oops() and arm64
__do_kernel_fault(), after KFENCE. When a kernel-side access faults inside
an arena's kern_vm range, the helper walks the stack to find the BPF program
responsible, range-checks the fault address against prog->aux->arena, and
atomically installs the scratch page into the empty PTE via the new
ptep_try_set() wrapper. The kernel instruction retries and reads/writes the
scratch page. Free paths and map destruction treat scratch as non-owned.
Real allocation refuses to overwrite scratch (apply_range_set_cb returns
-EBUSY). A scratched address stays dead until map destroy, since its
presence means the BPF program has already malfunctioned.

The mechanism is default behavior - no UAPI flag.

What this preserves
-------------------

All the debugging properties of today's sparse-PTE design are preserved:

* BPF programs still fault on unmapped arena accesses. The fault semantics
  (instruction retry with rdst = 0) and the violation report through
  bpf_streams are unchanged for prog-side accesses.

* The first kernel-side touch of an unmapped address is reported via
  bpf_streams the same way as a prog-side fault, with the stack walk
  attributing it to the originating prog.

* User-side fault on a never-scratched address still lazy-allocates a real
  page (or returns SIGSEGV under BPF_F_SEGV_ON_FAULT). User-side fault on a
  scratched address SIGSEGVs.

What changes for the kernel-side caller is just that an unmapped deref no
longer oopses - it retries through the scratch page and emits a violation
report. The same shape today's BPF instruction faults have.

Patches 1-2 (atomic PTE install + arena scratch-page recovery)
--------------------------------------------------------------

  mm: Add ptep_try_set() for lockless empty-slot installs
  bpf: Recover arena kernel faults with scratch page

Patches 3-5 (helpers used by struct_ops registration)
-----------------------------------------------------

  bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
  bpf: Add bpf_struct_ops_for_each_prog()
  bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()

Patches 6-8 (sched_ext: arena auto-discovery, allocator, set_cmask)
-------------------------------------------------------------------

  sched_ext: Require an arena for cid-form schedulers
  sched_ext: Sub-allocator over kernel-claimed BPF arena pages
  sched_ext: Convert ops.set_cmask() to arena-resident cmask

Patch 6 reads each member prog's prog->aux->arena via bpf_prog_arena() and
requires the cid-form struct_ops to reference exactly one arena. Patch 7
builds a gen_pool sub-allocator inside that arena. Patch 8 converts
set_cmask() to write into arena memory; BPF dereferences via __arena like
any other arena struct, no probe-reads.

Base
----

sched_ext/for-7.2 (1136fb1213d1) with cmask-prep-v2.3 applied:
  https://lore.kernel.org/r/20260519075838.2706712-1-tj@kernel.org

Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git arena-direct-v3

 Documentation/bpf/kfuncs.rst          |  14 +++
 arch/arm64/include/asm/pgtable.h      |   8 ++
 arch/arm64/mm/fault.c                 |  10 +-
 arch/x86/include/asm/pgtable.h        |   8 ++
 arch/x86/mm/fault.c                   |  12 +-
 include/linux/bpf.h                   |  14 +++
 include/linux/bpf_defs.h              |  11 ++
 include/linux/pgtable.h               |  26 ++++
 kernel/bpf/arena.c                    | 216 +++++++++++++++++++++++++++-------
 kernel/bpf/bpf_struct_ops.c           |  36 ++++++
 kernel/bpf/core.c                     |   5 +
 kernel/sched/build_policy.c           |   4 +
 kernel/sched/ext.c                    | 135 ++++++++++++++++++++-
 kernel/sched/ext_arena.c              | 127 ++++++++++++++++++++
 kernel/sched/ext_arena.h              |  18 +++
 kernel/sched/ext_cid.c                |  20 +---
 kernel/sched/ext_internal.h           |  23 +++-
 tools/sched_ext/include/scx/cid.bpf.h |  52 --------
 tools/sched_ext/scx_qmap.bpf.c        |   5 +-
 19 files changed, 616 insertions(+), 128 deletions(-)

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  7:00   ` Andrea Righi
  2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is
currently pte_none(). Returns true on success, false if the slot was already
populated or the arch has no implementation.

The intended caller is the upcoming bpf_arena kernel-side fault recovery
path. The install runs from a page fault that can be nested under locks
held by the faulting kernel caller (e.g. a BPF program holding
raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry
would A-A deadlock. Lock-free cmpxchg is the only viable option, which
constrains this helper to special kernel page tables where concurrent
writers cooperate via atomic accessors.

The generic version in <linux/pgtable.h> returns false. x86 and arm64
override with try_cmpxchg-based implementations on the underlying pteval.
Other architectures get the false stub - the callers there already fall
through to oops.

v2: Rename to ptep_try_set(). Tighten kerneldoc for kernel-PTE use.
    (David, Alexei)

Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
 arch/arm64/include/asm/pgtable.h |  8 ++++++++
 arch/x86/include/asm/pgtable.h   |  8 ++++++++
 include/linux/pgtable.h          | 26 ++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9029b81ccbe8..a129be91ef2c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1830,6 +1830,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return __ptep_get_and_clear(mm, addr, ptep);
 }
 
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pteval_t old = 0;
+
+	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+}
+#define ptep_try_set ptep_try_set
+
 #define test_and_clear_young_ptes test_and_clear_young_ptes
 static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *ptep, unsigned int nr)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 13e3e9a054cb..047e273a4eab 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1284,6 +1284,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	} while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
 }
 
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pte_t old_pte = __pte(0);
+
+	return try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte);
+}
+#define ptep_try_set ptep_try_set
+
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..d68374f404c1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1036,6 +1036,32 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
+#ifndef ptep_try_set
+/**
+ * ptep_try_set - atomically set an empty kernel PTE
+ * @ptep: page table entry
+ * @new_pte: value to install
+ *
+ * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return
+ * true on success, false if the slot was already populated or the
+ * arch has no implementation.
+ *
+ * For special kernel page tables only - never user page tables. The
+ * caller must prevent concurrent teardown of @ptep and must accept
+ * that other writers may race. Concurrent clearers must use
+ * ptep_get_and_clear() so racing accesses agree on the outcome.
+ *
+ * Architectures opt in by providing a cmpxchg-based override and
+ * defining ptep_try_set as an identity macro. The generic stub
+ * returns false, which is correct for callers that fall through to
+ * oops on failure.
+ */
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	return false;
+}
+#endif
+
 #ifndef wrprotect_ptes
 /**
  * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
  2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  3:16   ` Emil Tsalapatis
  2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

From: Kumar Kartikeya Dwivedi <memxor@gmail.com>

BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
over arena memory is awkward today. Data has to be staged through a trusted
kernel pointer with extra code and copying on the BPF side. While reads
through arena pointers can use a fault-safe helper, writes don't have a good
solution. The in-line alternative would need instruction emulation or asm
fixup labels.

Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
handed-in arena pointer, without bounds checking. A per-arena scratch page
is installed by the arch fault path into empty arena kernel PTEs - x86 from
page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
translation faults, both after the existing exception-table and KFENCE
handling. The faulting instruction retries and the access is also reported
through the program's BPF stream, preserving error reporting.

bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
half-guard is excluded because well-behaved kfuncs only access forward from
arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.

The install is lock-free via ptep_try_set(). On race-loss the winning
installer's PTE is already valid, so the access retry succeeds. The arena
clear path uses ptep_get_and_clear() so installer and clearer race through
atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped"
entries just cause one extra re-fault, cheaper than a global IPI on every
install.

Scratch exists only to keep the kernel from oopsing on an in-line arena
access. Its presence at a PTE means the BPF program has already
malfunctioned, and the violation is reported through the program's BPF
stream. The only requirement for behavior on a scratched PTE is that the
kernel doesn't crash. In particular, any user-side access through such a PTE
may segfault. The shared scratch page is freed once during map destruction.

BPF instruction faults continue to use the existing JIT exception-table
path. This patch changes only the kernel-text fault path. No UAPI flag is
added. The new behavior is the default.

v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
 Documentation/bpf/kfuncs.rst |  14 +++
 arch/arm64/mm/fault.c        |  10 +-
 arch/x86/mm/fault.c          |  12 ++-
 include/linux/bpf.h          |   1 +
 include/linux/bpf_defs.h     |  11 +++
 kernel/bpf/arena.c           | 177 +++++++++++++++++++++++++++--------
 kernel/bpf/core.c            |   5 +
 7 files changed, 183 insertions(+), 47 deletions(-)
 create mode 100644 include/linux/bpf_defs.h

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..6d497e720998 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
 PTR_TO_BTF_ID type matching if two types have the exact same name, with one
 being suffixed with ``___init``.
 
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
+is also covered by this recovery. A kfunc handed an arena pointer may
+therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
 .. _BPF_kfunc_lifecycle_expectations:
 
 3. kfunc lifecycle expectations
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 920a8b244d59..0d58d667fcd8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
 
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
 #include <linux/extable.h>
 #include <linux/kfence.h>
 #include <linux/signal.h>
@@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (esr_fsc_is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (esr_fsc_is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..b0f103ddbd23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
 #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
@@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (IS_ENABLED(CONFIG_EFI))
 		efi_crash_gracefully_on_page_fault(address);
 
-	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) &&
-	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
-		return;
+	/* Only not-present faults should be handled by KFENCE or BPF arena. */
+	if (!(error_code & X86_PF_PROT)) {
+		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+			return;
+		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+			return;
+	}
 
 oops:
 	/*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..831996c411cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
 
 #include <crypto/sha2.h>
 #include <linux/workqueue.h>
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..d98e033b8c0b
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..1c0b87ecc817 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,6 +53,7 @@ struct bpf_arena {
 	u64 user_vm_start;
 	u64 user_vm_end;
 	struct vm_struct *kern_vm;
+	struct page *scratch_page;
 	struct range_tree rt;
 	/* protects rt */
 	rqspinlock_t spinlock;
@@ -118,6 +119,11 @@ struct apply_range_data {
 	int i;
 };
 
+struct clear_range_data {
+	struct llist_head *free_pages;
+	struct page *scratch_page;
+};
+
 static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct apply_range_data *d = data;
@@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
 	flush_cache_vmap(start, start + size);
 }
 
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
 {
+	struct clear_range_data *d = data;
 	pte_t old_pte;
 	struct page *page;
 
-	/* sanity check */
-	old_pte = ptep_get(pte);
+	/*
+	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+	 * Both sides must be atomic.
+	 */
+	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
 	if (pte_none(old_pte) || !pte_present(old_pte))
-		return 0; /* nothing to do */
+		return 0;
 
 	page = pte_page(old_pte);
 	if (WARN_ON_ONCE(!page))
 		return -EINVAL;
 
-	pte_clear(&init_mm, addr, pte);
+	/*
+	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+	 * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+	 * here. Without the skip, scratch_page would be freed.
+	 */
+	if (page == d->scratch_page)
+		return 0;
+
+	__llist_add(&page->pcp_llist, d->free_pages);
+	return 0;
+}
 
-	/* Add page to the list so it is freed later */
-	if (free_pages)
-		__llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct page *scratch_page = data;
 
+	if (!pte_none(ptep_get(pte)))
+		return 0;
+	/*
+	 * Best-effort install. ptep_try_set() returns false only if another
+	 * installer (real allocation or concurrent fault) won the cmpxchg.
+	 * Their PTE is already valid, so the access retry succeeds.
+	 *
+	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+	 * cause one extra re-fault through this same path.
+	 */
+	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
 	return 0;
 }
 
 static int populate_pgtable_except_pte(struct bpf_arena *arena)
 {
+	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
 	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
 }
 
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 	init_irq_work(&arena->free_irq, arena_free_irq);
 	INIT_WORK(&arena->free_work, arena_free_worker);
 	bpf_map_init_from_attr(&arena->map, attr);
+
+	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+	if (err)
+		goto err_free_arena;
+
 	range_tree_init(&arena->rt);
 	err = range_tree_set(&arena->rt, 0, attr->max_entries);
-	if (err) {
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_free_scratch;
 	mutex_init(&arena->lock);
 	raw_res_spin_lock_init(&arena->spinlock);
 	err = populate_pgtable_except_pte(arena);
-	if (err) {
-		range_tree_destroy(&arena->rt);
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_destroy_rt;
 
 	return &arena->map;
+
+err_destroy_rt:
+	range_tree_destroy(&arena->rt);
+err_free_scratch:
+	__free_page(arena->scratch_page);
+err_free_arena:
+	bpf_map_area_free(arena);
 err:
 	free_vm_area(kern_vm);
 	return ERR_PTR(err);
@@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 
 static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 {
+	struct bpf_arena *arena = data;
 	struct page *page;
 	pte_t pte;
 
@@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 	if (!pte_present(pte)) /* sanity check */
 		return 0;
 	page = pte_page(pte);
+	/*
+	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+	 */
+	if (page == arena->scratch_page)
+		return 0;
 	/*
 	 * We do not update pte here:
 	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
 	 * free those pages.
 	 */
 	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
 	free_vm_area(arena->kern_vm);
 	range_tree_destroy(&arena->rt);
+	__free_page(arena->scratch_page);
 	bpf_map_area_free(arena);
 }
 
@@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 
 	page = vmalloc_to_page((void *)kaddr);
-	if (page)
+	if (page) {
+		if (page == arena->scratch_page)
+			/* BPF triggered scratch here; don't lazy-alloc over it */
+			goto out_sigsegv;
 		/* already have a page vmap-ed */
 		goto out;
+	}
 
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
 	if (ret)
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 
 	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		free_pages_nolock(page, 0);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
 	bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
 	bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	struct llist_head free_pages;
 	struct llist_node *pos, *t;
 	struct arena_free_span *s;
+	struct clear_range_data cdata;
 	unsigned long flags;
 	int ret = 0;
 
@@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	range_tree_set(&arena->rt, pgoff, page_cnt);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	/* clear ptes and collect struct pages */
 	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-				     apply_range_clear_cb, &free_pages);
+				     apply_range_clear_cb, &cdata);
 
 	/* drop the lock to do the tlb flush and zap pages */
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
 	struct arena_free_span *s;
 	u64 arena_vm_start, user_vm_start;
 	struct llist_head free_pages;
+	struct clear_range_data cdata;
 	struct page *page;
 	unsigned long full_uaddr;
 	long kaddr, page_cnt, pgoff;
@@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
 	user_vm_start = bpf_arena_get_user_vm_start(arena);
 
@@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
 
 		/* clear ptes and collect pages in free_pages llist */
 		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-					     apply_range_clear_cb, &free_pages);
+					     apply_range_clear_cb, &cdata);
 
 		range_tree_set(&arena->rt, pgoff, page_cnt);
 	}
@@ -928,23 +986,12 @@ static int __init kfunc_init(void)
 }
 late_initcall(kfunc_init);
 
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+					      unsigned long addr, unsigned long fault_ip)
 {
 	struct bpf_stream_stage ss;
-	struct bpf_prog *prog;
 	u64 user_vm_start;
 
-	/*
-	 * The RCU read lock is held to safely traverse the latch tree, but we
-	 * don't need its protection when accessing the prog, since it will not
-	 * disappear while we are handling the fault.
-	 */
-	rcu_read_lock();
-	prog = bpf_prog_ksym_find(fault_ip);
-	rcu_read_unlock();
-	if (!prog)
-		return;
-
 	/* Use main prog for stream access */
 	prog = prog->aux->main_prog_aux->prog;
 
@@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
 		bpf_stream_dump_stack(ss);
 	}));
 }
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+	struct bpf_arena *arena;
+	struct bpf_prog *prog;
+	unsigned long kbase;
+	unsigned long page_addr = addr & PAGE_MASK;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return false;
+
+	arena = prog->aux->arena;
+	/* a prog not using arena may be on stack, so arena can be NULL */
+	if (!arena)
+		return false;
+
+	kbase = bpf_arena_get_kern_vm_start(arena);
+
+	/*
+	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+	 * Lower guard is unreachable from kfuncs; an address there indicates
+	 * a different bug class - leave it to the regular kernel oops path.
+	 */
+	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+		return false;
+
+	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+			    apply_range_set_scratch_cb, arena->scratch_page);
+	flush_vmap_cache(page_addr, PAGE_SIZE);
+	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+	return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_prog *prog;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it will not
+	 * disappear while we are handling the fault.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+	if (!prog)
+		return;
+	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..fa368d8920d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
 {
 	return 0;
 }
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+					unsigned long fault_ip)
+{
+	return false;
+}
 
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
  2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
  2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  3:17   ` Emil Tsalapatis
  2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
only - it's used by the verifier to inline the kfunc when the call site is
non-sleepable. There is no sleepable equivalent for kernel callers; the
kfunc bpf_arena_alloc_pages itself is BPF-only.

sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
wrapper but passing sleepable=true to arena_alloc_pages().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h |  8 ++++++++
 kernel/bpf/arena.c  | 13 +++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 831996c411cf..64968ca6db51 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
 					  u64 flags);
 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+				      u64 flags);
 #else
 static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
 							int node_id, u64 flags)
@@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
 static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 }
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+						    int node_id, u64 flags)
+{
+	return NULL;
+}
 #endif
 
 extern const struct bpf_map_ops bpf_map_offload_ops;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1c0b87ecc817..a811cf6170fa 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -934,6 +934,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
 
 	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
 }
+
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+				      int node_id, u64 flags)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+		return NULL;
+
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog()
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
                   ` (2 preceding siblings ...)
  2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  4:07   ` Emil Tsalapatis
  2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Add a helper that walks the member progs of the struct_ops map
containing a given @kdata vmtable. struct_ops ->reg() callbacks (and
similar) sometimes need to inspect the loaded BPF programs, e.g. to
discover maps they reference via prog->aux->used_maps.

The implementation mirrors bpf_struct_ops_id(): container_of @kdata
to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog
for i in [0, funcs_cnt). Same access pattern, no new locking - by the
time ->reg() fires st_map is fully populated and stable.

A sched_ext follow-up walks the member progs of a cid-form scheduler's
struct_ops map, reads prog->aux->arena directly, and requires all member
progs to reference exactly one arena, without requiring the BPF program
to call a registration kfunc.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h         |  3 +++
 kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 64968ca6db51..5b99d786e98c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2129,6 +2129,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
 void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
 u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 05b366b821c3..16aec18ed31b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1203,6 +1203,42 @@ u32 bpf_struct_ops_id(const void *kdata)
 }
 EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
 
+/**
+ * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
+ * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
+ * @cb: callback invoked once per member prog; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Walks the struct_ops member progs registered on the map containing @kdata.
+ * Intended for use from struct_ops ->reg() callbacks (and similar) that need to
+ * inspect the loaded BPF programs (for example to discover maps they reference
+ * via @prog->aux->used_maps).
+ *
+ * Return 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data)
+{
+	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	u32 i;
+	int ret;
+
+	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->links[i])
+			continue;
+		ret = cb(st_map->links[i]->prog, data);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
 static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
                   ` (3 preceding siblings ...)
  2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  4:08   ` Emil Tsalapatis
  2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

struct bpf_arena is opaque to callers outside arena.c. Add two helpers
for struct_ops subsystems that need to reach into an arena:

  bpf_arena_map_kern_vm_start(struct bpf_map *map)
    returns @map's kern_vm_start. A sched_ext follow-up needs this
    to translate kern_va <-> uaddr.

  bpf_prog_arena(struct bpf_prog *prog)
    returns the bpf_map of the arena referenced by @prog (NULL if
    @prog references no arena). The verifier enforces at most one
    arena per program. Used by struct_ops callers that auto-discover
    an arena from a member prog and need to take a map reference.

Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/arena.c  | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b99d786e98c..e1ba57c10aaa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock);
 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
 u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index a811cf6170fa..51b9ae36feb6 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
 	return arena ? arena->user_vm_start : 0;
 }
 
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+	return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+	struct bpf_arena *arena = prog->aux->arena;
+
+	return arena ? &arena->map : NULL;
+}
+
 static long arena_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
                   ` (4 preceding siblings ...)
  2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  4:15   ` Emil Tsalapatis
  2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
  2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Upcoming patches will let the kernel place arena-resident scratch shared
with the BPF program (e.g. per-CPU set_cmask cmask) so the BPF side can
dereference it directly via __arena pointers, replacing the current
cmask_copy_from_kernel() probe-read loop. That requires each cid-form
scheduler to expose its arena to the kernel. Kernel- side accesses are
recovered by the per-arena scratch-page mechanism.

bpf_scx_reg_cid() walks the struct_ops member progs via
bpf_struct_ops_for_each_prog() and reads each prog's arena via
bpf_prog_arena(). The verifier enforces one arena per program, so each
member prog contributes at most one arena. All non-NULL contributions must
match and at least one member prog must use an arena. The map ref is held on
scx_sched and dropped on sched destroy. cpu-form schedulers (bpf_scx_reg)
are unchanged - no arena requirement.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c          | 56 ++++++++++++++++++++++++++++++++++++-
 kernel/sched/ext_internal.h |  8 ++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c458552d14f..56f94ac32ba0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	if (sch->arena_map)
+		bpf_map_put(sch->arena_map);
 	kfree(sch);
 }
 
@@ -6746,6 +6748,7 @@ struct scx_enable_cmd {
 		struct sched_ext_ops_cid	*ops_cid;
 	};
 	bool			is_cid_type;
+	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
 	int			ret;
 };
 
@@ -6913,6 +6916,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
 		return ERR_PTR(ret);
 	}
 #endif	/* CONFIG_EXT_SUB_SCHED */
+
+	/*
+	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+	 * drops the ref. After this point, sch owns the ref and any cleanup
+	 * runs through scx_sched_free_rcu_work() which puts it.
+	 */
+	sch->arena_map = cmd->arena_map;
+	cmd->arena_map = NULL;
 	return sch;
 
 #ifdef CONFIG_EXT_SUB_SCHED
@@ -7898,11 +7910,53 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 	return scx_enable(&cmd, link);
 }
 
+struct scx_arena_scan {
+	struct bpf_map	*arena;
+	int		err;
+};
+
+/*
+ * The verifier enforces one arena per BPF program, so each struct_ops
+ * member prog contributes at most one arena via bpf_prog_arena().
+ * Require all non-NULL contributions to match.
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	struct scx_arena_scan *s = data;
+	struct bpf_map *arena = bpf_prog_arena(prog);
+
+	if (!arena)
+		return 0;
+	if (s->arena && s->arena != arena) {
+		s->err = -EINVAL;
+		return 1;
+	}
+	s->arena = arena;
+	return 0;
+}
+
 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
 {
 	struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+	struct scx_arena_scan scan = {};
+	int ret;
 
-	return scx_enable(&cmd, link);
+	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+	if (scan.err) {
+		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+		return scan.err;
+	}
+	if (!scan.arena) {
+		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+		return -EINVAL;
+	}
+
+	bpf_map_inc(scan.arena);
+	cmd.arena_map = scan.arena;
+	ret = scx_enable(&cmd, link);
+	if (cmd.arena_map)		/* not consumed by scx_alloc_and_add_sched() */
+		bpf_map_put(cmd.arena_map);
+	return ret;
 }
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 7258aea94b9f..d40cfd29ddaa 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1111,6 +1111,14 @@ struct scx_sched {
 		struct sched_ext_ops_cid	ops_cid;
 	};
 	bool			is_cid_type;	/* true if registered via bpf_sched_ext_ops_cid */
+
+	/*
+	 * Arena map auto-discovered from member progs at struct_ops attach.
+	 * cid-form schedulers must use exactly one arena across all member
+	 * progs. NULL on cpu-form.
+	 */
+	struct bpf_map		*arena_map;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
                   ` (5 preceding siblings ...)
  2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  7:56   ` Andrea Righi
  2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.

scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
kernel VA. On exhaustion, the pool grows by claiming more pages via
bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
mapping address; callers translate to the BPF-arena form themselves if
needed.

Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.

scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/build_policy.c |   4 ++
 kernel/sched/ext.c          |  11 ++++
 kernel/sched/ext_arena.c    | 127 ++++++++++++++++++++++++++++++++++++
 kernel/sched/ext_arena.h    |  18 +++++
 kernel/sched/ext_internal.h |   5 ++
 5 files changed, 165 insertions(+)
 create mode 100644 kernel/sched/ext_arena.c
 create mode 100644 kernel/sched/ext_arena.h

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 5e76c9177d54..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -59,12 +59,16 @@
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 # include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
 # include "ext_types.h"
 # include "ext_internal.h"
 # include "ext_cid.h"
+# include "ext_arena.h"
 # include "ext_idle.h"
 # include "ext.c"
 # include "ext_cid.c"
+# include "ext_arena.c"
 # include "ext_idle.c"
 #endif
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 56f94ac32ba0..fb91079c1244 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
 	kfree(sch);
@@ -7155,6 +7156,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7473,6 +7480,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..53174033765d
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+	SCX_ARENA_MIN_ORDER		= 3,	/* 8-byte minimum sub-allocation */
+	SCX_ARENA_GROW_PAGES		= 4,	/* per growth */
+};
+
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+	if (!sch->arena_map)
+		return 0;
+
+	sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+	if (!sch->arena_pool)
+		return -ENOMEM;
+	return 0;
+}
+
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+				  void *data)
+{
+	int order = pool->min_alloc_order;
+	size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+	unsigned long end_bit = chunk_sz >> order;
+	unsigned long b, e;
+
+	for_each_set_bitrange(b, e, chunk->bits, end_bit)
+		gen_pool_free(pool, chunk->start_addr + (b << order),
+			      (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+	if (!sch->arena_pool)
+		return;
+	gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+	gen_pool_destroy(sch->arena_pool);
+	sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+	u64 kern_vm_start;
+	u32 uaddr32;
+	void *p;
+	int ret;
+
+	if (!sch->arena_map || !sch->arena_pool)
+		return -EINVAL;
+
+	p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+					    page_cnt, NUMA_NO_NODE, 0);
+	if (!p)
+		return -ENOMEM;
+
+	uaddr32 = (u32)(unsigned long)p;
+	kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+
+	ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+			   page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+	if (ret) {
+		bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must
+ * be in a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+	unsigned long kern_va;
+	u32 page_cnt;
+
+	might_sleep();
+
+	if (!sch->arena_pool)
+		return NULL;
+
+	kern_va = gen_pool_alloc(sch->arena_pool, size);
+	if (!kern_va) {
+		page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+				 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+		if (scx_arena_grow(sch, page_cnt))
+			return NULL;
+		kern_va = gen_pool_alloc(sch->arena_pool, size);
+		if (!kern_va)
+			return NULL;
+	}
+
+	return (void *)kern_va;
+}
+
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+	if (sch->arena_pool && kern_va)
+		gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index d40cfd29ddaa..ff7e882bd67a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1116,8 +1116,13 @@ struct scx_sched {
 	 * Arena map auto-discovered from member progs at struct_ops attach.
 	 * cid-form schedulers must use exactly one arena across all member
 	 * progs. NULL on cpu-form.
+	 *
+	 * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+	 * at the kernel-side mapping address. Grows on demand and pages are
+	 * not released until sched destroy.
 	 */
 	struct bpf_map		*arena_map;
+	struct gen_pool		*arena_pool;
 
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
  2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
                   ` (6 preceding siblings ...)
  2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
@ 2026-05-20 23:50 ` Tejun Heo
  2026-05-21  4:19   ` Emil Tsalapatis
  7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.

With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
 kernel/sched/ext_cid.c                | 20 +-------
 kernel/sched/ext_internal.h           | 10 +++-
 tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
 tools/sched_ext/scx_qmap.bpf.c        |  5 +-
 5 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fb91079c1244..94562e3350c6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
 		update_locked_rq(rq);
 
 	if (scx_is_cid_type()) {
-		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
-		lockdep_assert_irqs_disabled();
-		scx_cpumask_to_cmask(cpumask, cmask);
-		sch->ops_cid.set_cmask(task, cmask);
+		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+		unsigned long uaddr = (unsigned long)kern_va -
+			bpf_arena_map_kern_vm_start(sch->arena_map);
+		/*
+		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+		 * holds the rq lock with IRQs disabled, which makes us the sole
+		 * user of the scratch area.
+		 */
+		scx_cpumask_to_cmask(cpumask, kern_va);
+		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
 	} else {
 		sch->ops.set_cpumask(task, cpumask);
 	}
@@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
 static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->is_cid_type || !sch->arena_pool)
+		return 0;
+
+	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+	if (!sch->set_cmask_scratch)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		*slot = scx_arena_alloc(sch, size);
+		if (!*slot)
+			return -ENOMEM;
+		scx_cmask_init(*slot, 0, num_possible_cpus());
+	}
+	return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->set_cmask_scratch)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		scx_arena_free(sch, *slot, size);
+	}
+	free_percpu(sch->set_cmask_scratch);
+	sch->set_cmask_scratch = NULL;
+}
+
 static void scx_sched_free_rcu_work(struct work_struct *work)
 {
 	struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_set_cmask_scratch_free(sch);
 	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
@@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		goto err_disable;
 	}
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 0c91b951fd33..808c6390da5a 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
  */
 #include <linux/cacheinfo.h>
 
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * cid tables.
  *
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
 	u32 npossible = num_possible_cpus();
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
-	struct scx_cmask __percpu *set_cmask_scratch;
-	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
 	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
 	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
 	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
-	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
-						       SCX_CMASK_NR_WORDS(npossible)),
-					   sizeof(u64));
 
-	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
 		kfree(cid_to_cpu);
 		kfree(cpu_to_cid);
 		kfree(cid_topo);
-		free_percpu(set_cmask_scratch);
 		return -ENOMEM;
 	}
 
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
-	for_each_possible_cpu(cpu)
-		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
-			       0, npossible);
-	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
 	struct bpf_map		*arena_map;
 	struct gen_pool		*arena_pool;
 
+	/*
+	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+	 * arena's kern_vm_start.
+	 */
+	struct scx_cmask * __percpu *set_cmask_scratch;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * True when the currently loaded scheduler hierarchy is cid-form. All scheds
  * in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index e281c88fa824..70f2a3829af4 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
 	}
 }
 
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
-						   const struct scx_cmask *src)
-{
-	u32 base = 0, nr_cids = 0, nr_words, wi;
-
-	if (dst->base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
-		scx_bpf_error("probe-read cmask->base failed");
-		return;
-	}
-	if (base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
-		scx_bpf_error("probe-read cmask->nr_cids failed");
-		return;
-	}
-
-	if (nr_cids > dst->nr_cids) {
-		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
-			      nr_cids, dst->nr_cids);
-		return;
-	}
-
-	nr_words = CMASK_NR_WORDS(nr_cids);
-	cmask_zero(dst);
-	bpf_for(wi, 0, CMASK_MAX_WORDS) {
-		u64 word = 0;
-		if (wi >= nr_words)
-			break;
-		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
-			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
-			return;
-		}
-		dst->bits[wi] = word;
-	}
-}
-
 #endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
 }
 
 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
-		    const struct scx_cmask *cmask)
+		    const struct scx_cmask *cmask_in)
 {
+	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
 	task_ctx_t *taskc;
 
 	taskc = lookup_task_ctx(p);
 	if (!taskc)
 		return;
-	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+	cmask_copy(&taskc->cpus_allowed, cmask);
 }
 
 struct monitor_timer {
-- 
2.54.0



^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
  2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
@ 2026-05-21  3:16   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  3:16 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>
> BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
> over arena memory is awkward today. Data has to be staged through a trusted
> kernel pointer with extra code and copying on the BPF side. While reads
> through arena pointers can use a fault-safe helper, writes don't have a good
> solution. The in-line alternative would need instruction emulation or asm
> fixup labels.
>
> Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
> handed-in arena pointer, without bounds checking. A per-arena scratch page
> is installed by the arch fault path into empty arena kernel PTEs - x86 from
> page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
> translation faults, both after the existing exception-table and KFENCE
> handling. The faulting instruction retries and the access is also reported
> through the program's BPF stream, preserving error reporting.
>
> bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
> from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
> the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
> half-guard is excluded because well-behaved kfuncs only access forward from
> arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
> a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.
>
> The install is lock-free via ptep_try_set(). On race-loss the winning
> installer's PTE is already valid, so the access retry succeeds. The arena
> clear path uses ptep_get_and_clear() so installer and clearer race through
> atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped"
> entries just cause one extra re-fault, cheaper than a global IPI on every
> install.
>
> Scratch exists only to keep the kernel from oopsing on an in-line arena
> access. Its presence at a PTE means the BPF program has already
> malfunctioned, and the violation is reported through the program's BPF
> stream. The only requirement for behavior on a scratched PTE is that the
> kernel doesn't crash. In particular, any user-side access through such a PTE
> may segfault. The shared scratch page is freed once during map destruction.
>
> BPF instruction faults continue to use the existing JIT exception-table
> path. This patch changes only the kernel-text fault path. No UAPI flag is
> added. The new behavior is the default.
>
> v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)
>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: David Hildenbrand <david@kernel.org>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

>  Documentation/bpf/kfuncs.rst |  14 +++
>  arch/arm64/mm/fault.c        |  10 +-
>  arch/x86/mm/fault.c          |  12 ++-
>  include/linux/bpf.h          |   1 +
>  include/linux/bpf_defs.h     |  11 +++
>  kernel/bpf/arena.c           | 177 +++++++++++++++++++++++++++--------
>  kernel/bpf/core.c            |   5 +
>  7 files changed, 183 insertions(+), 47 deletions(-)
>  create mode 100644 include/linux/bpf_defs.h
>
> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..6d497e720998 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst
> @@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
>  PTR_TO_BTF_ID type matching if two types have the exact same name, with one
>  being suffixed with ``___init``.
>  
> +2.8 Accessing arena memory through kfunc arguments
> +--------------------------------------------------
> +
> +A read or write at any address inside an arena does not oops the kernel.
> +Unallocated arena pages are lazily backed by a scratch page and the
> +access is reported through the program's BPF stream as an error. Only
> +the BPF program's correctness is affected; the kernel itself remains
> +intact.
> +
> +The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
> +is also covered by this recovery. A kfunc handed an arena pointer may
> +therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
> +against the arena. Larger accesses must verify the range explicitly.
> +
>  .. _BPF_kfunc_lifecycle_expectations:
>  
>  3. kfunc lifecycle expectations
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 920a8b244d59..0d58d667fcd8 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -9,6 +9,7 @@
>  
>  #include <linux/acpi.h>
>  #include <linux/bitfield.h>
> +#include <linux/bpf_defs.h>
>  #include <linux/extable.h>
>  #include <linux/kfence.h>
>  #include <linux/signal.h>
> @@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
>  	} else if (addr < PAGE_SIZE) {
>  		msg = "NULL pointer dereference";
>  	} else {
> -		if (esr_fsc_is_translation_fault(esr) &&
> -		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
> -			return;
> +		if (esr_fsc_is_translation_fault(esr)) {
> +			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
> +				return;
> +			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
> +				return;
> +		}
>  
>  		msg = "paging request";
>  	}
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index f0e77e084482..b0f103ddbd23 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -8,6 +8,7 @@
>  #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
>  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
>  #include <linux/memblock.h>		/* max_low_pfn			*/
> +#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
>  #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
>  #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
>  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
> @@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  	if (IS_ENABLED(CONFIG_EFI))
>  		efi_crash_gracefully_on_page_fault(address);
>  
> -	/* Only not-present faults should be handled by KFENCE. */
> -	if (!(error_code & X86_PF_PROT) &&
> -	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
> -		return;
> +	/* Only not-present faults should be handled by KFENCE or BPF arena. */
> +	if (!(error_code & X86_PF_PROT)) {
> +		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
> +			return;
> +		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
> +			return;
> +	}
>  
>  oops:
>  	/*
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0136a108d083..831996c411cf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -6,6 +6,7 @@
>  
>  #include <uapi/linux/bpf.h>
>  #include <uapi/linux/filter.h>
> +#include <linux/bpf_defs.h>
>  
>  #include <crypto/sha2.h>
>  #include <linux/workqueue.h>
> diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
> new file mode 100644
> index 000000000000..d98e033b8c0b
> --- /dev/null
> +++ b/include/linux/bpf_defs.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Subset of bpf.h declarations, split out so files that need only these
> + * declarations can avoid bpf.h's full include cost.
> + */
> +#ifndef _LINUX_BPF_DEFS_H
> +#define _LINUX_BPF_DEFS_H
> +
> +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
> +
> +#endif /* _LINUX_BPF_DEFS_H */
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 08d008cc471e..1c0b87ecc817 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -53,6 +53,7 @@ struct bpf_arena {
>  	u64 user_vm_start;
>  	u64 user_vm_end;
>  	struct vm_struct *kern_vm;
> +	struct page *scratch_page;
>  	struct range_tree rt;
>  	/* protects rt */
>  	rqspinlock_t spinlock;
> @@ -118,6 +119,11 @@ struct apply_range_data {
>  	int i;
>  };
>  
> +struct clear_range_data {
> +	struct llist_head *free_pages;
> +	struct page *scratch_page;
> +};
> +
>  static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
>  {
>  	struct apply_range_data *d = data;
> @@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
>  	flush_cache_vmap(start, start + size);
>  }
>  
> -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
> +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
>  {
> +	struct clear_range_data *d = data;
>  	pte_t old_pte;
>  	struct page *page;
>  
> -	/* sanity check */
> -	old_pte = ptep_get(pte);
> +	/*
> +	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
> +	 * Both sides must be atomic.
> +	 */
> +	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
>  	if (pte_none(old_pte) || !pte_present(old_pte))
> -		return 0; /* nothing to do */
> +		return 0;
>  
>  	page = pte_page(old_pte);
>  	if (WARN_ON_ONCE(!page))
>  		return -EINVAL;
>  
> -	pte_clear(&init_mm, addr, pte);
> +	/*
> +	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
> +	 * scratches its PTE. A later bpf_arena_free_pages() over that range walks
> +	 * here. Without the skip, scratch_page would be freed.
> +	 */
> +	if (page == d->scratch_page)
> +		return 0;
> +
> +	__llist_add(&page->pcp_llist, d->free_pages);
> +	return 0;
> +}
>  
> -	/* Add page to the list so it is freed later */
> -	if (free_pages)
> -		__llist_add(&page->pcp_llist, free_pages);
> +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
> +{
> +	struct page *scratch_page = data;
>  
> +	if (!pte_none(ptep_get(pte)))
> +		return 0;
> +	/*
> +	 * Best-effort install. ptep_try_set() returns false only if another
> +	 * installer (real allocation or concurrent fault) won the cmpxchg.
> +	 * Their PTE is already valid, so the access retry succeeds.
> +	 *
> +	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
> +	 * cause one extra re-fault through this same path.
> +	 */
> +	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
>  	return 0;
>  }
>  
>  static int populate_pgtable_except_pte(struct bpf_arena *arena)
>  {
> +	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
>  	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
> -				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
> +				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
>  }
>  
>  static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
> @@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
>  	init_irq_work(&arena->free_irq, arena_free_irq);
>  	INIT_WORK(&arena->free_work, arena_free_worker);
>  	bpf_map_init_from_attr(&arena->map, attr);
> +
> +	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
> +	if (err)
> +		goto err_free_arena;
> +
>  	range_tree_init(&arena->rt);
>  	err = range_tree_set(&arena->rt, 0, attr->max_entries);
> -	if (err) {
> -		bpf_map_area_free(arena);
> -		goto err;
> -	}
> +	if (err)
> +		goto err_free_scratch;
>  	mutex_init(&arena->lock);
>  	raw_res_spin_lock_init(&arena->spinlock);
>  	err = populate_pgtable_except_pte(arena);
> -	if (err) {
> -		range_tree_destroy(&arena->rt);
> -		bpf_map_area_free(arena);
> -		goto err;
> -	}
> +	if (err)
> +		goto err_destroy_rt;
>  
>  	return &arena->map;
> +
> +err_destroy_rt:
> +	range_tree_destroy(&arena->rt);
> +err_free_scratch:
> +	__free_page(arena->scratch_page);
> +err_free_arena:
> +	bpf_map_area_free(arena);
>  err:
>  	free_vm_area(kern_vm);
>  	return ERR_PTR(err);
> @@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
>  
>  static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
>  {
> +	struct bpf_arena *arena = data;
>  	struct page *page;
>  	pte_t pte;
>  
> @@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
>  	if (!pte_present(pte)) /* sanity check */
>  		return 0;
>  	page = pte_page(pte);
> +	/*
> +	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
> +	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
> +	 */
> +	if (page == arena->scratch_page)
> +		return 0;
>  	/*
>  	 * We do not update pte here:
>  	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
> @@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
>  	 * free those pages.
>  	 */
>  	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
> -				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
> +				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
>  	free_vm_area(arena->kern_vm);
>  	range_tree_destroy(&arena->rt);
> +	__free_page(arena->scratch_page);
>  	bpf_map_area_free(arena);
>  }
>  
> @@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_RETRY;
>  
>  	page = vmalloc_to_page((void *)kaddr);
> -	if (page)
> +	if (page) {
> +		if (page == arena->scratch_page)
> +			/* BPF triggered scratch here; don't lazy-alloc over it */
> +			goto out_sigsegv;
>  		/* already have a page vmap-ed */
>  		goto out;
> +	}
>  
>  	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
>  
>  	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>  		/* User space requested to segfault when page is not allocated by bpf prog */
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  
>  	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
>  	if (ret)
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  
>  	struct apply_range_data data = { .pages = &page, .i = 0 };
>  	/* Account into memcg of the process that created bpf_arena */
>  	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  	}
>  
>  	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>  		free_pages_nolock(page, 0);
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  	}
>  	flush_vmap_cache(kaddr, PAGE_SIZE);
>  	bpf_map_memcg_exit(old_memcg, new_memcg);
> @@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>  	vmf->page = page;
>  	return 0;
> -out_unlock_sigsegv:
> +out_sigsegv_memcg:
>  	bpf_map_memcg_exit(old_memcg, new_memcg);
> +out_sigsegv:
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>  	return VM_FAULT_SIGSEGV;
>  }
> @@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>  	struct llist_head free_pages;
>  	struct llist_node *pos, *t;
>  	struct arena_free_span *s;
> +	struct clear_range_data cdata;
>  	unsigned long flags;
>  	int ret = 0;
>  
> @@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>  	range_tree_set(&arena->rt, pgoff, page_cnt);
>  
>  	init_llist_head(&free_pages);
> +	cdata.free_pages = &free_pages;
> +	cdata.scratch_page = arena->scratch_page;
>  	/* clear ptes and collect struct pages */
>  	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
> -				     apply_range_clear_cb, &free_pages);
> +				     apply_range_clear_cb, &cdata);
>  
>  	/* drop the lock to do the tlb flush and zap pages */
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
> @@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
>  	struct arena_free_span *s;
>  	u64 arena_vm_start, user_vm_start;
>  	struct llist_head free_pages;
> +	struct clear_range_data cdata;
>  	struct page *page;
>  	unsigned long full_uaddr;
>  	long kaddr, page_cnt, pgoff;
> @@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
>  	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
>  
>  	init_llist_head(&free_pages);
> +	cdata.free_pages = &free_pages;
> +	cdata.scratch_page = arena->scratch_page;
>  	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
>  	user_vm_start = bpf_arena_get_user_vm_start(arena);
>  
> @@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
>  
>  		/* clear ptes and collect pages in free_pages llist */
>  		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
> -					     apply_range_clear_cb, &free_pages);
> +					     apply_range_clear_cb, &cdata);
>  
>  		range_tree_set(&arena->rt, pgoff, page_cnt);
>  	}
> @@ -928,23 +986,12 @@ static int __init kfunc_init(void)
>  }
>  late_initcall(kfunc_init);
>  
> -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
> +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
> +					      unsigned long addr, unsigned long fault_ip)
>  {
>  	struct bpf_stream_stage ss;
> -	struct bpf_prog *prog;
>  	u64 user_vm_start;
>  
> -	/*
> -	 * The RCU read lock is held to safely traverse the latch tree, but we
> -	 * don't need its protection when accessing the prog, since it will not
> -	 * disappear while we are handling the fault.
> -	 */
> -	rcu_read_lock();
> -	prog = bpf_prog_ksym_find(fault_ip);
> -	rcu_read_unlock();
> -	if (!prog)
> -		return;
> -
>  	/* Use main prog for stream access */
>  	prog = prog->aux->main_prog_aux->prog;
>  
> @@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
>  		bpf_stream_dump_stack(ss);
>  	}));
>  }
> +
> +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
> +{
> +	struct bpf_arena *arena;
> +	struct bpf_prog *prog;
> +	unsigned long kbase;
> +	unsigned long page_addr = addr & PAGE_MASK;
> +
> +	prog = bpf_prog_find_from_stack();
> +	if (!prog)
> +		return false;
> +
> +	arena = prog->aux->arena;
> +	/* a prog not using arena may be on stack, so arena can be NULL */
> +	if (!arena)
> +		return false;
> +
> +	kbase = bpf_arena_get_kern_vm_start(arena);
> +
> +	/*
> +	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
> +	 * Lower guard is unreachable from kfuncs; an address there indicates
> +	 * a different bug class - leave it to the regular kernel oops path.
> +	 */
> +	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
> +		return false;
> +
> +	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
> +			    apply_range_set_scratch_cb, arena->scratch_page);
> +	flush_vmap_cache(page_addr, PAGE_SIZE);
> +	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
> +	return true;
> +}
> +
> +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
> +{
> +	struct bpf_prog *prog;
> +
> +	/*
> +	 * The RCU read lock is held to safely traverse the latch tree, but we
> +	 * don't need its protection when accessing the prog, since it will not
> +	 * disappear while we are handling the fault.
> +	 */
> +	rcu_read_lock();
> +	prog = bpf_prog_ksym_find(fault_ip);
> +	rcu_read_unlock();
> +	if (!prog)
> +		return;
> +	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
> +}
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 066b86e7233c..fa368d8920d9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
>  {
>  	return 0;
>  }
> +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
> +					unsigned long fault_ip)
> +{
> +	return false;
> +}
>  
>  #ifdef CONFIG_BPF_SYSCALL
>  static int __init bpf_global_ma_init(void)



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
  2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
@ 2026-05-21  3:17   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  3:17 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
> only - it's used by the verifier to inline the kfunc when the call site is
> non-sleepable. There is no sleepable equivalent for kernel callers; the
> kfunc bpf_arena_alloc_pages itself is BPF-only.
>
> sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
> paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
> wrapper but passing sleepable=true to arena_alloc_pages().
>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  include/linux/bpf.h |  8 ++++++++
>  kernel/bpf/arena.c  | 13 +++++++++++++
>  2 files changed, 21 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 831996c411cf..64968ca6db51 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
>  void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
>  					  u64 flags);
>  void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
> +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
> +				      u64 flags);
>  #else
>  static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
>  							int node_id, u64 flags)
> @@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
>  static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  }
> +
> +static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
> +						    int node_id, u64 flags)
> +{
> +	return NULL;
> +}
>  #endif
>  
>  extern const struct bpf_map_ops bpf_map_offload_ops;
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 1c0b87ecc817..a811cf6170fa 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -934,6 +934,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
>  
>  	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
>  }
> +
> +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
> +				      int node_id, u64 flags)
> +{
> +	struct bpf_map *map = p__map;
> +	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
> +
> +	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
> +		return NULL;
> +
> +	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> +}
> +
>  __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  	struct bpf_map *map = p__map;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog()
  2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
@ 2026-05-21  4:07   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  4:07 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> Add a helper that walks the member progs of the struct_ops map
> containing a given @kdata vmtable. struct_ops ->reg() callbacks (and
> similar) sometimes need to inspect the loaded BPF programs, e.g. to
> discover maps they reference via prog->aux->used_maps.
>
> The implementation mirrors bpf_struct_ops_id(): container_of @kdata
> to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog
> for i in [0, funcs_cnt). Same access pattern, no new locking - by the
> time ->reg() fires st_map is fully populated and stable.
>
> A sched_ext follow-up walks the member progs of a cid-form scheduler's
> struct_ops map, reads prog->aux->arena directly, and requires all member
> progs to reference exactly one arena, without requiring the BPF program
> to call a registration kfunc.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  include/linux/bpf.h         |  3 +++
>  kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 39 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 64968ca6db51..5b99d786e98c 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2129,6 +2129,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
>  void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
>  void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
>  u32 bpf_struct_ops_id(const void *kdata);
> +int bpf_struct_ops_for_each_prog(const void *kdata,
> +				 int (*cb)(struct bpf_prog *prog, void *data),
> +				 void *data);
>  
>  #ifdef CONFIG_NET
>  /* Define it here to avoid the use of forward declaration */
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 05b366b821c3..16aec18ed31b 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -1203,6 +1203,42 @@ u32 bpf_struct_ops_id(const void *kdata)
>  }
>  EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
>  
> +/**
> + * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
> + * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
> + * @cb: callback invoked once per member prog; non-zero return stops iteration
> + * @data: opaque argument passed to @cb
> + *
> + * Walks the struct_ops member progs registered on the map containing @kdata.
> + * Intended for use from struct_ops ->reg() callbacks (and similar) that need to
> + * inspect the loaded BPF programs (for example to discover maps they reference
> + * via @prog->aux->used_maps).
> + *
> + * Return 0 if iteration completed, otherwise the first non-zero @cb return.
> + */
> +int bpf_struct_ops_for_each_prog(const void *kdata,
> +				 int (*cb)(struct bpf_prog *prog, void *data),
> +				 void *data)
> +{
> +	struct bpf_struct_ops_value *kvalue;
> +	struct bpf_struct_ops_map *st_map;
> +	u32 i;
> +	int ret;
> +
> +	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
> +	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
> +
> +	for (i = 0; i < st_map->funcs_cnt; i++) {
> +		if (!st_map->links[i])
> +			continue;
> +		ret = cb(st_map->links[i]->prog, data);
> +		if (ret)
> +			return ret;
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
> +
>  static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
>  {
>  	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
  2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
@ 2026-05-21  4:08   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  4:08 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> struct bpf_arena is opaque to callers outside arena.c. Add two helpers
> for struct_ops subsystems that need to reach into an arena:
>
>   bpf_arena_map_kern_vm_start(struct bpf_map *map)
>     returns @map's kern_vm_start. A sched_ext follow-up needs this
>     to translate kern_va <-> uaddr.
>
>   bpf_prog_arena(struct bpf_prog *prog)
>     returns the bpf_map of the arena referenced by @prog (NULL if
>     @prog references no arena). The verifier enforces at most one
>     arena per program. Used by struct_ops callers that auto-discover
>     an arena from a member prog and need to take a map reference.
>
> Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  include/linux/bpf.h |  2 ++
>  kernel/bpf/arena.c  | 26 ++++++++++++++++++++++++++
>  2 files changed, 28 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b99d786e98c..e1ba57c10aaa 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
>  		      struct bpf_spin_lock *spin_lock);
>  u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
>  u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
> +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
> +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
>  int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
>  
>  struct bpf_offload_dev;
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index a811cf6170fa..51b9ae36feb6 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
>  	return arena ? arena->user_vm_start : 0;
>  }
>  
> +/**
> + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
> + * @map: a BPF_MAP_TYPE_ARENA map
> + *
> + * Return @map's kern_vm_start.
> + */
> +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
> +{
> +	return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
> +}
> +
> +/**
> + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
> + * @prog: a loaded BPF program
> + *
> + * The verifier enforces at most one arena per program and stores it in
> + * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
> + * @prog does not reference an arena.
> + */
> +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
> +{
> +	struct bpf_arena *arena = prog->aux->arena;
> +
> +	return arena ? &arena->map : NULL;
> +}
> +
>  static long arena_map_peek_elem(struct bpf_map *map, void *value)
>  {
>  	return -EOPNOTSUPP;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers
  2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
@ 2026-05-21  4:15   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  4:15 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> Upcoming patches will let the kernel place arena-resident scratch shared
> with the BPF program (e.g. per-CPU set_cmask cmask) so the BPF side can
> dereference it directly via __arena pointers, replacing the current
> cmask_copy_from_kernel() probe-read loop. That requires each cid-form
> scheduler to expose its arena to the kernel. Kernel- side accesses are
> recovered by the per-arena scratch-page mechanism.
>
> bpf_scx_reg_cid() walks the struct_ops member progs via
> bpf_struct_ops_for_each_prog() and reads each prog's arena via
> bpf_prog_arena(). The verifier enforces one arena per program, so each
> member prog contributes at most one arena. All non-NULL contributions must
> match and at least one member prog must use an arena. The map ref is held on
> scx_sched and dropped on sched destroy. cpu-form schedulers (bpf_scx_reg)
> are unchanged - no arena requirement.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  kernel/sched/ext.c          | 56 ++++++++++++++++++++++++++++++++++++-
>  kernel/sched/ext_internal.h |  8 ++++++
>  2 files changed, 63 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 9c458552d14f..56f94ac32ba0 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -5003,6 +5003,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>  
>  	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
>  	free_exit_info(sch->exit_info);
> +	if (sch->arena_map)
> +		bpf_map_put(sch->arena_map);
>  	kfree(sch);
>  }
>  
> @@ -6746,6 +6748,7 @@ struct scx_enable_cmd {
>  		struct sched_ext_ops_cid	*ops_cid;
>  	};
>  	bool			is_cid_type;
> +	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
>  	int			ret;
>  };
>  
> @@ -6913,6 +6916,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
>  		return ERR_PTR(ret);
>  	}
>  #endif	/* CONFIG_EXT_SUB_SCHED */
> +
> +	/*
> +	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
> +	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
> +	 * drops the ref. After this point, sch owns the ref and any cleanup
> +	 * runs through scx_sched_free_rcu_work() which puts it.
> +	 */
> +	sch->arena_map = cmd->arena_map;
> +	cmd->arena_map = NULL;
>  	return sch;
>  
>  #ifdef CONFIG_EXT_SUB_SCHED
> @@ -7898,11 +7910,53 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
>  	return scx_enable(&cmd, link);
>  }
>  
> +struct scx_arena_scan {
> +	struct bpf_map	*arena;
> +	int		err;

Can we skip the int err here...

> +};
> +
> +/*
> + * The verifier enforces one arena per BPF program, so each struct_ops
> + * member prog contributes at most one arena via bpf_prog_arena().
> + * Require all non-NULL contributions to match.
> + */
> +static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
> +{
> +	struct scx_arena_scan *s = data;
> +	struct bpf_map *arena = bpf_prog_arena(prog);
> +
> +	if (!arena)
> +		return 0;
> +	if (s->arena && s->arena != arena) {
> +		s->err = -EINVAL;

...and just directly return -EINVAL here? bpf_struct_ops_for_each_prog
breaks when we return non-zero so do we need the extra scx_arena_scan
struct?

> +		return 1;
> +	}
> +	s->arena = arena;
> +	return 0;
> +}
> +
>  static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
>  {
>  	struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
> +	struct scx_arena_scan scan = {};
> +	int ret;
>  
> -	return scx_enable(&cmd, link);
> +	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
> +	if (scan.err) {
> +		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
> +		return scan.err;
> +	}
> +	if (!scan.arena) {
> +		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
> +		return -EINVAL;
> +	}
> +
> +	bpf_map_inc(scan.arena);
> +	cmd.arena_map = scan.arena;
> +	ret = scx_enable(&cmd, link);
> +	if (cmd.arena_map)		/* not consumed by scx_alloc_and_add_sched() */
> +		bpf_map_put(cmd.arena_map);
> +	return ret;
>  }
>  
>  static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index 7258aea94b9f..d40cfd29ddaa 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -1111,6 +1111,14 @@ struct scx_sched {
>  		struct sched_ext_ops_cid	ops_cid;
>  	};
>  	bool			is_cid_type;	/* true if registered via bpf_sched_ext_ops_cid */
> +
> +	/*
> +	 * Arena map auto-discovered from member progs at struct_ops attach.
> +	 * cid-form schedulers must use exactly one arena across all member
> +	 * progs. NULL on cpu-form.
> +	 */
> +	struct bpf_map		*arena_map;
> +
>  	DECLARE_BITMAP(has_op, SCX_OPI_END);
>  
>  	/*



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
  2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
@ 2026-05-21  4:19   ` Emil Tsalapatis
  0 siblings, 0 replies; 18+ messages in thread
From: Emil Tsalapatis @ 2026-05-21  4:19 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
> arena, so it translated cpumask -> cmask in kernel memory and passed the
> result as a trusted pointer. The BPF cmask helpers all operate on arena
> cmasks though, so the BPF side had to word-by-word probe-read the kernel
> cmask into an arena cmask via cmask_copy_from_kernel() before any helper
> could touch it. It works, but is clumsy.
>
> With direct kernel-side arena access now in place, build the cmask in the
> arena. The kernel writes to it through the kern_va side of the dual mapping;
> BPF directly dereferences it via an __arena pointer like any other arena
> struct.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
>  kernel/sched/ext_cid.c                | 20 +-------
>  kernel/sched/ext_internal.h           | 10 +++-
>  tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
>  tools/sched_ext/scx_qmap.bpf.c        |  5 +-
>  5 files changed, 75 insertions(+), 80 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index fb91079c1244..94562e3350c6 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
>  		update_locked_rq(rq);
>  
>  	if (scx_is_cid_type()) {
> -		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
> -
> -		lockdep_assert_irqs_disabled();
> -		scx_cpumask_to_cmask(cpumask, cmask);
> -		sch->ops_cid.set_cmask(task, cmask);
> +		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
> +		unsigned long uaddr = (unsigned long)kern_va -
> +			bpf_arena_map_kern_vm_start(sch->arena_map);
> +		/*
> +		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
> +		 * holds the rq lock with IRQs disabled, which makes us the sole
> +		 * user of the scratch area.
> +		 */
> +		scx_cpumask_to_cmask(cpumask, kern_va);
> +		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
>  	} else {
>  		sch->ops.set_cpumask(task, cpumask);
>  	}
> @@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
>  static void free_pnode(struct scx_sched_pnode *pnode);
>  static void free_exit_info(struct scx_exit_info *ei);
>  
> +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
> +{
> +	size_t size = struct_size_t(struct scx_cmask, bits,
> +				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
> +	int cpu;
> +
> +	if (!sch->is_cid_type || !sch->arena_pool)
> +		return 0;
> +
> +	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
> +	if (!sch->set_cmask_scratch)
> +		return -ENOMEM;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> +		*slot = scx_arena_alloc(sch, size);
> +		if (!*slot)
> +			return -ENOMEM;
> +		scx_cmask_init(*slot, 0, num_possible_cpus());
> +	}
> +	return 0;
> +}
> +
> +static void scx_set_cmask_scratch_free(struct scx_sched *sch)
> +{
> +	size_t size = struct_size_t(struct scx_cmask, bits,
> +				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
> +	int cpu;
> +
> +	if (!sch->set_cmask_scratch)
> +		return;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
> +
> +		scx_arena_free(sch, *slot, size);
> +	}
> +	free_percpu(sch->set_cmask_scratch);
> +	sch->set_cmask_scratch = NULL;
> +}
> +
>  static void scx_sched_free_rcu_work(struct work_struct *work)
>  {
>  	struct rcu_work *rcu_work = to_rcu_work(work);
> @@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
>  
>  	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
>  	free_exit_info(sch->exit_info);
> +	scx_set_cmask_scratch_free(sch);
>  	scx_arena_pool_destroy(sch);
>  	if (sch->arena_map)
>  		bpf_map_put(sch->arena_map);
> @@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
>  		goto err_disable;
>  	}
>  
> +	ret = scx_set_cmask_scratch_alloc(sch);
> +	if (ret) {
> +		cpus_read_unlock();
> +		goto err_disable;
> +	}
> +
>  	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
>  		if (((void (**)(void))ops)[i])
>  			set_bit(i, sch->has_op);
> @@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
>  	if (ret)
>  		goto err_disable;
>  
> +	ret = scx_set_cmask_scratch_alloc(sch);
> +	if (ret)
> +		goto err_disable;
> +
>  	if (validate_ops(sch, ops))
>  		goto err_disable;
>  
> diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
> index 0c91b951fd33..808c6390da5a 100644
> --- a/kernel/sched/ext_cid.c
> +++ b/kernel/sched/ext_cid.c
> @@ -7,14 +7,6 @@
>   */
>  #include <linux/cacheinfo.h>
>  
> -/*
> - * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
> - * cmask from a cpumask. Allocated alongside the cid arrays on first enable
> - * and never freed. Sized to the full cid space. Caller holds rq lock so
> - * this_cpu_ptr is safe.
> - */
> -struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
>  /*
>   * cid tables.
>   *
> @@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
>  	u32 npossible = num_possible_cpus();
>  	s16 *cid_to_cpu, *cpu_to_cid;
>  	struct scx_cid_topo *cid_topo;
> -	struct scx_cmask __percpu *set_cmask_scratch;
> -	s32 cpu;
>  
>  	if (scx_cid_to_cpu_tbl)
>  		return 0;
> @@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
>  	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
>  	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
>  	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
> -	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
> -						       SCX_CMASK_NR_WORDS(npossible)),
> -					   sizeof(u64));
>  
> -	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
> +	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
>  		kfree(cid_to_cpu);
>  		kfree(cpu_to_cid);
>  		kfree(cid_topo);
> -		free_percpu(set_cmask_scratch);
>  		return -ENOMEM;
>  	}
>  
>  	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
>  	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
>  	WRITE_ONCE(scx_cid_topo, cid_topo);
> -	for_each_possible_cpu(cpu)
> -		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
> -			       0, npossible);
> -	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
>  	return 0;
>  }
>  
> diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
> index ff7e882bd67a..9bb65367f510 100644
> --- a/kernel/sched/ext_internal.h
> +++ b/kernel/sched/ext_internal.h
> @@ -1124,6 +1124,14 @@ struct scx_sched {
>  	struct bpf_map		*arena_map;
>  	struct gen_pool		*arena_pool;
>  
> +	/*
> +	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
> +	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
> +	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
> +	 * arena's kern_vm_start.
> +	 */
> +	struct scx_cmask * __percpu *set_cmask_scratch;
> +
>  	DECLARE_BITMAP(has_op, SCX_OPI_END);
>  
>  	/*
> @@ -1480,8 +1488,6 @@ enum scx_ops_state {
>  extern struct scx_sched __rcu *scx_root;
>  DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
>  
> -extern struct scx_cmask __percpu *scx_set_cmask_scratch;
> -
>  /*
>   * True when the currently loaded scheduler hierarchy is cid-form. All scheds
>   * in a hierarchy share one form, so this single key tells callsites which
> diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
> index e281c88fa824..70f2a3829af4 100644
> --- a/tools/sched_ext/include/scx/cid.bpf.h
> +++ b/tools/sched_ext/include/scx/cid.bpf.h
> @@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
>  	}
>  }
>  
> -/**
> - * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
> - * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
> - * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
> - *
> - * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
> - * scx_bpf_error() on probe failure or precondition violation.
> - */
> -static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
> -						   const struct scx_cmask *src)
> -{
> -	u32 base = 0, nr_cids = 0, nr_words, wi;
> -
> -	if (dst->base != 0) {
> -		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
> -		return;
> -	}
> -
> -	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
> -		scx_bpf_error("probe-read cmask->base failed");
> -		return;
> -	}
> -	if (base != 0) {
> -		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
> -		return;
> -	}
> -
> -	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
> -		scx_bpf_error("probe-read cmask->nr_cids failed");
> -		return;
> -	}
> -
> -	if (nr_cids > dst->nr_cids) {
> -		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
> -			      nr_cids, dst->nr_cids);
> -		return;
> -	}
> -
> -	nr_words = CMASK_NR_WORDS(nr_cids);
> -	cmask_zero(dst);
> -	bpf_for(wi, 0, CMASK_MAX_WORDS) {
> -		u64 word = 0;
> -		if (wi >= nr_words)
> -			break;
> -		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
> -			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
> -			return;
> -		}
> -		dst->bits[wi] = word;
> -	}
> -}
> -
>  #endif /* __SCX_CID_BPF_H */
> diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
> index 7e77f22674ea..8a2d6a8ebd8e 100644
> --- a/tools/sched_ext/scx_qmap.bpf.c
> +++ b/tools/sched_ext/scx_qmap.bpf.c
> @@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
>  }
>  
>  void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
> -		    const struct scx_cmask *cmask)
> +		    const struct scx_cmask *cmask_in)
>  {
> +	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
>  	task_ctx_t *taskc;
>  
>  	taskc = lookup_task_ctx(p);
>  	if (!taskc)
>  		return;
> -	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
> +	cmask_copy(&taskc->cpus_allowed, cmask);
>  }
>  
>  struct monitor_timer {



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs
  2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
@ 2026-05-21  7:00   ` Andrea Righi
  0 siblings, 0 replies; 18+ messages in thread
From: Andrea Righi @ 2026-05-21  7:00 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, Alexei Starovoitov, Andrii Nakryiko,
	Daniel Borkmann, Martin KaFai Lau, Kumar Kartikeya Dwivedi,
	Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

Hi Tejun,

On Wed, May 20, 2026 at 01:50:45PM -1000, Tejun Heo wrote:
> Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is
> currently pte_none(). Returns true on success, false if the slot was already
> populated or the arch has no implementation.
> 
> The intended caller is the upcoming bpf_arena kernel-side fault recovery
> path. The install runs from a page fault that can be nested under locks
> held by the faulting kernel caller (e.g. a BPF program holding
> raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry
> would A-A deadlock. Lock-free cmpxchg is the only viable option, which
> constrains this helper to special kernel page tables where concurrent
> writers cooperate via atomic accessors.
> 
> The generic version in <linux/pgtable.h> returns false. x86 and arm64
> override with try_cmpxchg-based implementations on the underlying pteval.
> Other architectures get the false stub - the callers there already fall
> through to oops.
> 
> v2: Rename to ptep_try_set(). Tighten kerneldoc for kernel-PTE use.
>     (David, Alexei)
> 
> Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: David Hildenbrand <david@kernel.org>
> ---
>  arch/arm64/include/asm/pgtable.h |  8 ++++++++
>  arch/x86/include/asm/pgtable.h   |  8 ++++++++
>  include/linux/pgtable.h          | 26 ++++++++++++++++++++++++++
>  3 files changed, 42 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 9029b81ccbe8..a129be91ef2c 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1830,6 +1830,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  	return __ptep_get_and_clear(mm, addr, ptep);
>  }
>  
> +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
> +{
> +	pteval_t old = 0;
> +
> +	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
> +}
> +#define ptep_try_set ptep_try_set
> +
>  #define test_and_clear_young_ptes test_and_clear_young_ptes
>  static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
>  		unsigned long addr, pte_t *ptep, unsigned int nr)
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 13e3e9a054cb..047e273a4eab 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1284,6 +1284,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
>  	} while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
>  }
>  
> +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
> +{
> +	pte_t old_pte = __pte(0);
> +
> +	return try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte);
> +}

Minor nit (feel free to ignore), on x86 pte_none() is defined as:

static inline int pte_none(pte_t pte)
{
	return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
}

With:

#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
#else
#define _PAGE_KNL_ERRATUM_MASK 0
#endif

If that mask has the D/A bits set, try_cmpxchg(..., &old=0, ...) will reject a
PTE that has only those bits set, even though pte_none() would return true. I
think this is fine for the bpf_arena use case, since hardware shouldn't set A/D
for fresh pages that the BPF prog hasn't touched.

Maybe it's worth adding a comment (something along these lines)?

 /*
  * Note: strictly-zero compare is narrower than pte_none() (see pte_none() and
  * _PAGE_KNL_ERRATUM_MASK), but the gap is harmless in practice: HW shouldn't
  * set _PAGE_DIRTY | _PAGE_ACCESSED bits on entries the caller never touched.
  */

Other than that, looks good to me.

Reviewed-by: Andrea Righi <arighi@nvidia.com>

Thanks,
-Andrea

> +#define ptep_try_set ptep_try_set
> +
>  #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
>  
>  #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index cdd68ed3ae1a..d68374f404c1 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -1036,6 +1036,32 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
>  }
>  #endif
>  
> +#ifndef ptep_try_set
> +/**
> + * ptep_try_set - atomically set an empty kernel PTE
> + * @ptep: page table entry
> + * @new_pte: value to install
> + *
> + * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return
> + * true on success, false if the slot was already populated or the
> + * arch has no implementation.
> + *
> + * For special kernel page tables only - never user page tables. The
> + * caller must prevent concurrent teardown of @ptep and must accept
> + * that other writers may race. Concurrent clearers must use
> + * ptep_get_and_clear() so racing accesses agree on the outcome.
> + *
> + * Architectures opt in by providing a cmpxchg-based override and
> + * defining ptep_try_set as an identity macro. The generic stub
> + * returns false, which is correct for callers that fall through to
> + * oops on failure.
> + */
> +static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
> +{
> +	return false;
> +}
> +#endif
> +
>  #ifndef wrprotect_ptes
>  /**
>   * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
> -- 
> 2.54.0
> 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
  2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
@ 2026-05-21  7:56   ` Andrea Righi
  0 siblings, 0 replies; 18+ messages in thread
From: Andrea Righi @ 2026-05-21  7:56 UTC (permalink / raw)
  To: Tejun Heo
  Cc: David Vernet, Changwoo Min, Alexei Starovoitov, Andrii Nakryiko,
	Daniel Borkmann, Martin KaFai Lau, Kumar Kartikeya Dwivedi,
	Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel

Hi Tejun,

On Wed, May 20, 2026 at 01:50:51PM -1000, Tejun Heo wrote:
> Build a per-scheduler sub-allocator on top of pages claimed from the BPF
> arena registered in the previous patch. Subsequent kernel-managed
> arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
> from this pool.
> 
> scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
> kernel VA. On exhaustion, the pool grows by claiming more pages via
> bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
> mapping address; callers translate to the BPF-arena form themselves if
> needed.
> 
> Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
> arena page allocation. All current consumers run from the enable path (after
> ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
> where sleeping is fine.
> 
> scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
> gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
> underlying arena pages are released when the arena map itself is torn down,
> so the pool destroy doesn't free them explicitly.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---

...

> +/*
> + * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
> + * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must
> + * be in a GFP_KERNEL context.
> + */
> +void *scx_arena_alloc(struct scx_sched *sch, size_t size)
> +{
> +	unsigned long kern_va;
> +	u32 page_cnt;
> +
> +	might_sleep();
> +
> +	if (!sch->arena_pool)
> +		return NULL;
> +
> +	kern_va = gen_pool_alloc(sch->arena_pool, size);
> +	if (!kern_va) {
> +		page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
> +				 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
> +		if (scx_arena_grow(sch, page_cnt))
> +			return NULL;
> +		kern_va = gen_pool_alloc(sch->arena_pool, size);
> +		if (!kern_va)
> +			return NULL;

IIUC, since @page_cnt is sized to cover @size and the new chunk is added empty
to the pool, gen_pool_alloc() here should always succeed. Should we do:

  if (WARN_ON_ONCE(!kern_va))
      return NULL;

to catch potential logical bugs / future concurrency / exotic configurations?

Thanks,
-Andrea


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2026-05-21  7:56 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-20 23:50 [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-20 23:50 ` [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs Tejun Heo
2026-05-21  7:00   ` Andrea Righi
2026-05-20 23:50 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
2026-05-21  3:16   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
2026-05-21  3:17   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
2026-05-21  4:07   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
2026-05-21  4:08   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
2026-05-21  4:15   ` Emil Tsalapatis
2026-05-20 23:50 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-05-21  7:56   ` Andrea Righi
2026-05-20 23:50 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
2026-05-21  4:19   ` Emil Tsalapatis
  -- strict thread matches above, loose matches on Subject: below --
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox