* [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-18 8:06 ` David Hildenbrand (Arm)
2026-05-17 21:12 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
` (6 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
Add ptep_try_install(ptep, new_pte): atomically set *ptep to new_pte
iff it is currently pte_none(). Returns true on success, false if the
slot was already populated or the arch has no implementation.
The intended caller is the upcoming bpf_arena kernel-side fault
recovery path. The install runs from a page fault and may have to
contend with locks already held by the faulting kernel caller, so
keeping it lock-free via cmpxchg is the safe choice.
The generic version in <linux/pgtable.h> returns false. x86 and arm64
override with try_cmpxchg-based implementations on the underlying
pteval. Other architectures get the false stub - the callers there
already fall through to oops.
Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
arch/arm64/include/asm/pgtable.h | 8 ++++++++
arch/x86/include/asm/pgtable.h | 8 ++++++++
include/linux/pgtable.h | 16 ++++++++++++++++
3 files changed, 32 insertions(+)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9029b81ccbe8..eb1dd59aaea3 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1830,6 +1830,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
return __ptep_get_and_clear(mm, addr, ptep);
}
+#define __HAVE_ARCH_PTEP_TRY_INSTALL
+static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
+{
+ pteval_t old = 0;
+
+ return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+}
+
#define test_and_clear_young_ptes test_and_clear_young_ptes
static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep, unsigned int nr)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 13e3e9a054cb..afff54a77ffd 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1284,6 +1284,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
} while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}
+#define __HAVE_ARCH_PTEP_TRY_INSTALL
+static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
+{
+ pte_t old_pte = __pte(0);
+
+ return try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte);
+}
+
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..60a4afdc9131 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1036,6 +1036,22 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif
+#ifndef __HAVE_ARCH_PTEP_TRY_INSTALL
+/**
+ * ptep_try_install - atomically install an empty PTE
+ * @ptep: page table entry
+ * @new_pte: value to install
+ *
+ * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return
+ * true on success. Architectures opt in by providing a cmpxchg-based
+ * override. The generic stub returns false.
+ */
+static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
+{
+ return false;
+}
+#endif
+
#ifndef wrprotect_ptes
/**
* wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-17 21:12 ` [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs Tejun Heo
@ 2026-05-18 8:06 ` David Hildenbrand (Arm)
2026-05-18 19:53 ` Tejun Heo
0 siblings, 1 reply; 18+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-18 8:06 UTC (permalink / raw)
To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, Mike Rapoport,
Emil Tsalapatis, sched-ext, bpf, x86, linux-arm-kernel, linux-mm,
linux-kernel
On 5/17/26 23:12, Tejun Heo wrote:
> Add ptep_try_install(ptep, new_pte): atomically set *ptep to new_pte
> iff it is currently pte_none(). Returns true on success, false if the
> slot was already populated or the arch has no implementation.
>
> The intended caller is the upcoming bpf_arena kernel-side fault
> recovery path. The install runs from a page fault and may have to
> contend with locks already held by the faulting kernel caller, so
> keeping it lock-free via cmpxchg is the safe choice.
>
> The generic version in <linux/pgtable.h> returns false. x86 and arm64
> override with try_cmpxchg-based implementations on the underlying
> pteval. Other architectures get the false stub - the callers there
> already fall through to oops.
>
> Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
[...]
>
> +#ifndef __HAVE_ARCH_PTEP_TRY_INSTALL
> +/**
> + * ptep_try_install - atomically install an empty PTE
> + * @ptep: page table entry
> + * @new_pte: value to install
> + *
> + * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return
> + * true on success. Architectures opt in by providing a cmpxchg-based
> + * override. The generic stub returns false.
> + */
> +static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
> +{
> + return false;
> +}
> +#endif
Ehm, what?
This is a very, very, very bad generic idea/interface.
On which ptes is this supposed to be used? User ptes or kernel ptes?
Surely we don't want this on user ptes.
--
Cheers,
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-18 8:06 ` David Hildenbrand (Arm)
@ 2026-05-18 19:53 ` Tejun Heo
2026-05-19 8:00 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-18 19:53 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
x86, linux-arm-kernel, linux-mm, linux-kernel
Hello,
On Mon, May 18, 2026 at 10:06:29AM +0200, David Hildenbrand (Arm) wrote:
> On 5/17/26 23:12, Tejun Heo wrote:
> > +static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
> > +{
> > + return false;
> > +}
> > +#endif
>
> Ehm, what?
>
> This is a very, very, very bad generic idea/interface.
>
> On which ptes is this supposed to be used? User ptes or kernel ptes?
>
> Surely we don't want this on user ptes.
Yeah, this is only for the BPF arena PTEs which are already managed in their
own way. I'd be happy to place / gate it however appropriate.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-18 19:53 ` Tejun Heo
@ 2026-05-19 8:00 ` David Hildenbrand (Arm)
2026-05-19 8:58 ` Tejun Heo
0 siblings, 1 reply; 18+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-19 8:00 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
x86, linux-arm-kernel, linux-mm, linux-kernel
On 5/18/26 21:53, Tejun Heo wrote:
> Hello,
>
> On Mon, May 18, 2026 at 10:06:29AM +0200, David Hildenbrand (Arm) wrote:
>> On 5/17/26 23:12, Tejun Heo wrote:
>>> +static inline bool ptep_try_install(pte_t *ptep, pte_t new_pte)
>>> +{
>>> + return false;
>>> +}
>>> +#endif
>>
>> Ehm, what?
>>
>> This is a very, very, very bad generic idea/interface.
>>
>> On which ptes is this supposed to be used? User ptes or kernel ptes?
>>
>> Surely we don't want this on user ptes.
>
> Yeah, this is only for the BPF arena PTEs which are already managed in their
> own way. I'd be happy to place / gate it however appropriate.
So, we only use it within apply_to_page_range() with init_mm, where we don't use
page table locks.
Usually we seem to grab the &arena->spinlock to protect the kernel page tables.
And you're saying that we might get called from a page fault handler where that
lock can already be held.
Is that really possible? I'd much rather prefer to trylock and retry, unless
that can really result in deadlocks. But I have the feeling that such deadlocks
should be impossible here.
Can you elaborate on the deadlock situation and how that can happen in valid
scenarios?
The thing is that ptep_try_install() relies on other page table walking and
insertion code to do the right thing. Shaky.
For example, staring at apply_range_set_cb(), what prevents:
(1) apply_range_set_cb() finding pte_none(ptep_get(pte))
(2) apply_range_set_scratch_cb() succeeding ptep_try_install()
(3) apply_range_set_cb() overwriting the pte with set_pte_at()
Between (2) and (3) CPUs could access the scratch PTE.
--
Cheers,
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-19 8:00 ` David Hildenbrand (Arm)
@ 2026-05-19 8:58 ` Tejun Heo
2026-05-19 9:05 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-19 8:58 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
x86, linux-arm-kernel, linux-mm, linux-kernel
Hello, David.
On Tue, May 19, 2026 at 10:00:39AM +0200, David Hildenbrand (Arm) wrote:
> Is that really possible? I'd much rather prefer to trylock and retry, unless
> that can really result in deadlocks. But I have the feeling that such deadlocks
> should be impossible here.
I'm not well versed in either mm or BPF, so the BPF folks will have a
better take. But here's a scenario that seemed plausible to me:
1. A bpf prog calls bpf_arena_alloc_pages() on its arena. The kernel
takes arena->spinlock via raw_res_spin_lock_irqsave().
2. Under the lock, the alloc path goes through bpf_map_alloc_pages()
-> alloc_pages_node(), which fires trace_mm_page_alloc().
3. A BPF tracepoint program on mm_page_alloc that shares the arena
starts running with the lock still held.
4. The tracepoint program calls a kfunc, passing an arena pointer
one entry past the array it meant to touch.
5. The kfunc dereferences. The kernel-side address is unbacked, so
the CPU faults.
trylock + retry at 5 would A-A deadlock.
> For example, staring at apply_range_set_cb(), what prevents:
>
> (1) apply_range_set_cb() finding pte_none(ptep_get(pte)
> (2) apply_range_set_scratch_cb() succeeding ptep_try_install()
> (3) apply_range_set_cb() overwriting the pte with set_pte_at()
>
> Between (2) and (3) CPUs could access the scratch PTE.
Scratch only gets installed when BPF passes an unallocated arena
address to the kernel side, which is itself the violation, reported
through the program's BPF stream. Behavior at that addr is then
undefined. For scx, the scheduler should be aborted and torn down.
The only requirements are that the kernel doesn't oops and the
violation gets caught. Beyond that, behavior at the address is
unspecified, and which installer wins the race doesn't matter as
long as kernel integrity holds.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-19 8:58 ` Tejun Heo
@ 2026-05-19 9:05 ` David Hildenbrand (Arm)
2026-05-19 9:40 ` David Hildenbrand (Arm)
0 siblings, 1 reply; 18+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-19 9:05 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
x86, linux-arm-kernel, linux-mm, linux-kernel
On 5/19/26 10:58, Tejun Heo wrote:
> Hello, David.
>
> On Tue, May 19, 2026 at 10:00:39AM +0200, David Hildenbrand (Arm) wrote:
>> Is that really possible? I'd much rather prefer to trylock and retry, unless
>> that can really result in deadlocks. But I have the feeling that such deadlocks
>> should be impossible here.
>
> I'm not well versed in either mm or BPF, so the BPF folks will have a
> better take. But here's a scenario that seemed plausible to me:
>
> 1. A bpf prog calls bpf_arena_alloc_pages() on its arena. The kernel
> takes arena->spinlock via raw_res_spin_lock_irqsave().
> 2. Under the lock, the alloc path goes through bpf_map_alloc_pages()
> -> alloc_pages_node(), which fires trace_mm_page_alloc().
> 3. A BPF tracepoint program on mm_page_alloc that shares the arena
> starts running with the lock still held.
> 4. The tracepoint program calls a kfunc, passing an arena pointer
> one entry past the array it meant to touch.
> 5. The kfunc dereferences. The kernel-side address is unbacked, so
> the CPU faults.
>
> trylock + retry at 5 would A-A deadlock.
Okay, so removing that specific tracepoint (or rather, any tracepoints under the
lock) would solve the problem, right?
>
>> For example, staring at apply_range_set_cb(), what prevents:
>>
>> (1) apply_range_set_cb() finding pte_none(ptep_get(pte)
>> (2) apply_range_set_scratch_cb() succeeding ptep_try_install()
>> (3) apply_range_set_cb() overwriting the pte with set_pte_at()
>>
>> Between (2) and (3) CPUs could access the scratch PTE.
>
> Scratch only gets installed when BPF passes an unallocated arena
> address to the kernel side, which is itself the violation, reported
> through the program's BPF stream. Behavior at that addr is then
> undefined. For scx, the scheduler should be aborted and torn down.
>
> The only requirements are that the kernel doesn't oops and the
> violation gets caught. Beyond that, behavior at the address is
> unspecified, and which installer wins the race doesn't matter as
> long as kernel integrity holds.
You'll have inconsistent TLB state.
I really don't like that approach.
We should really try to just take the lock, and remove any code under the lock
that could trigger such unpleasant deadlocks.
Is that feasible?
--
Cheers,
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs
2026-05-19 9:05 ` David Hildenbrand (Arm)
@ 2026-05-19 9:40 ` David Hildenbrand (Arm)
0 siblings, 0 replies; 18+ messages in thread
From: David Hildenbrand (Arm) @ 2026-05-19 9:40 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
x86, linux-arm-kernel, linux-mm, linux-kernel
On 5/19/26 11:05, David Hildenbrand (Arm) wrote:
> On 5/19/26 10:58, Tejun Heo wrote:
>> Hello, David.
>>
>> On Tue, May 19, 2026 at 10:00:39AM +0200, David Hildenbrand (Arm) wrote:
>>> Is that really possible? I'd much rather prefer to trylock and retry, unless
>>> that can really result in deadlocks. But I have the feeling that such deadlocks
>>> should be impossible here.
>>
>> I'm not well versed in either mm or BPF, so the BPF folks will have a
>> better take. But here's a scenario that seemed plausible to me:
>>
>> 1. A bpf prog calls bpf_arena_alloc_pages() on its arena. The kernel
>> takes arena->spinlock via raw_res_spin_lock_irqsave().
>> 2. Under the lock, the alloc path goes through bpf_map_alloc_pages()
>> -> alloc_pages_node(), which fires trace_mm_page_alloc().
>> 3. A BPF tracepoint program on mm_page_alloc that shares the arena
>> starts running with the lock still held.
>> 4. The tracepoint program calls a kfunc, passing an arena pointer
>> one entry past the array it meant to touch.
>> 5. The kfunc dereferences. The kernel-side address is unbacked, so
>> the CPU faults.
>>
>> trylock + retry at 5 would A-A deadlock.
>
> Okay, so removing that specific tracepoint (or rather, any tracpoints under the
> lock) would solve the problem, right?
>
>>
>>> For example, staring at apply_range_set_cb(), what prevents:
>>>
>>> (1) apply_range_set_cb() finding pte_none(ptep_get(pte)
>>> (2) apply_range_set_scratch_cb() succeeding ptep_try_install()
>>> (3) apply_range_set_cb() overwriting the pte with set_pte_at()
>>>
>>> Between (2) and (3) CPUs could access the scratch PTE.
>>
>> Scratch only gets installed when BPF passes an unallocated arena
>> address to the kernel side, which is itself the violation, reported
>> through the program's BPF stream. Behavior at that addr is then
>> undefined. For scx, the scheduler should be aborted and torn down.
>>
>> The only requirements are that the kernel doesn't oops and the
>> violation gets caught. Beyond that, behavior at the address is
>> unspecified, and which installer wins the race doesn't matter as
>> long as kernel integrity holds.
>
> You'll have inconsistent TLB state.
>
> I really don't like that approach.
>
> We should really try to just take the lock, and remove any code under the lock
> that could trigger such unpleasant deadlocks.
>
> Is that feasible?
>
... or can we run into similar problems with kprobes? (I am obviously no bpf
expert ...)
--
Cheers,
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-17 21:12 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
` (5 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
over arena memory is awkward today. Data has to be staged through a trusted
kernel pointer with extra code and copying on the BPF side. While reads
through arena pointers can use a fault-safe helper, writes don't have a good
solution; the in-line alternative would need instruction emulation or asm
fixup labels.
Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
handed-in arena pointer, without bounds checking. A per-arena scratch page
is installed by the arch fault path into empty arena kernel PTEs - x86 from
page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
translation faults, both after the existing exception-table and KFENCE
handling. The faulting instruction retries and the access is also reported
through the program's BPF stream, preserving error reporting.
bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2); the lower
half-guard is excluded because well-behaved kfuncs only access forward from
arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.
The install is lock-free via ptep_try_install(): on race-loss the winning
installer's PTE is already valid, so the access retry succeeds. No
flush_tlb_kernel_range() afterwards - stale "not mapped" entries just cause
one extra re-fault, cheaper than a global IPI on every install.
Scratch exists only to keep the kernel from oopsing on an in-line arena
access. Its presence at a PTE means the BPF program has already
malfunctioned, and the violation is reported through the program's BPF
stream. The only requirement for behavior on a scratched PTE is that the
kernel doesn't crash; in particular, any user-side access through such a PTE
may segfault. The shared scratch page is freed once during map destruction.
BPF instruction faults continue to use the existing JIT exception-table
path; this patch changes only the kernel-text fault path. No UAPI flag is
added; the new behavior is the default.
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
Documentation/bpf/kfuncs.rst | 14 +++
arch/arm64/mm/fault.c | 10 ++-
arch/x86/mm/fault.c | 12 ++-
include/linux/bpf.h | 1 +
include/linux/bpf_defs.h | 11 +++
kernel/bpf/arena.c | 170 +++++++++++++++++++++++++++--------
kernel/bpf/core.c | 5 ++
7 files changed, 181 insertions(+), 42 deletions(-)
create mode 100644 include/linux/bpf_defs.h
diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..6d497e720998 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
PTR_TO_BTF_ID type matching if two types have the exact same name, with one
being suffixed with ``___init``.
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
+is also covered by this recovery. A kfunc handed an arena pointer may
+therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
.. _BPF_kfunc_lifecycle_expectations:
3. kfunc lifecycle expectations
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 920a8b244d59..0d58d667fcd8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
#include <linux/acpi.h>
#include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
@@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
} else if (addr < PAGE_SIZE) {
msg = "NULL pointer dereference";
} else {
- if (esr_fsc_is_translation_fault(esr) &&
- kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
- return;
+ if (esr_fsc_is_translation_fault(esr)) {
+ if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+ return;
+ if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+ return;
+ }
msg = "paging request";
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..b0f103ddbd23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/memblock.h> /* max_low_pfn */
+#include <linux/bpf_defs.h> /* bpf_arena_handle_page_fault */
#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
@@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (IS_ENABLED(CONFIG_EFI))
efi_crash_gracefully_on_page_fault(address);
- /* Only not-present faults should be handled by KFENCE. */
- if (!(error_code & X86_PF_PROT) &&
- kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
- return;
+ /* Only not-present faults should be handled by KFENCE or BPF arena. */
+ if (!(error_code & X86_PF_PROT)) {
+ if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+ return;
+ if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+ return;
+ }
oops:
/*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..831996c411cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
#include <uapi/linux/bpf.h>
#include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
#include <crypto/sha2.h>
#include <linux/workqueue.h>
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..d98e033b8c0b
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..bc696bef7104 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,6 +53,7 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
+ struct page *scratch_page;
struct range_tree rt;
/* protects rt */
rqspinlock_t spinlock;
@@ -118,6 +119,11 @@ struct apply_range_data {
int i;
};
+struct clear_range_data {
+ struct llist_head *free_pages;
+ struct page *scratch_page;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
@@ -144,8 +150,9 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct clear_range_data *d = data;
pte_t old_pte;
struct page *page;
@@ -160,17 +167,45 @@ static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages
pte_clear(&init_mm, addr, pte);
+ if (unlikely(!d))
+ return 0;
+
+ /*
+ * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+ * scratches its PTE; a later bpf_arena_free_pages() over that range walks
+ * here. Without the skip, scratch_page would be freed.
+ */
+ if (page == d->scratch_page)
+ return 0;
+
/* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+ __llist_add(&page->pcp_llist, d->free_pages);
+ return 0;
+}
+
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *scratch_page = data;
+ if (!pte_none(ptep_get(pte)))
+ return 0;
+ /*
+ * Best-effort install. ptep_try_install() returns false only if another
+ * installer (real allocation or concurrent fault) won the cmpxchg; their
+ * PTE is already valid, so the access retry succeeds.
+ *
+ * No flush_tlb_kernel_range() needed: stale "not mapped" entries just
+ * cause one extra re-fault through this same path.
+ */
+ ptep_try_install(pte, mk_pte(scratch_page, PAGE_KERNEL));
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +256,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
init_irq_work(&arena->free_irq, arena_free_irq);
INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
+
+ err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+ if (err)
+ goto err_free_arena;
+
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
- if (err) {
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_free_scratch;
mutex_init(&arena->lock);
raw_res_spin_lock_init(&arena->spinlock);
err = populate_pgtable_except_pte(arena);
- if (err) {
- range_tree_destroy(&arena->rt);
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_destroy_rt;
return &arena->map;
+
+err_destroy_rt:
+ range_tree_destroy(&arena->rt);
+err_free_scratch:
+ __free_page(arena->scratch_page);
+err_free_arena:
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
@@ -244,6 +286,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
@@ -251,6 +294,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
if (!pte_present(pte)) /* sanity check */
return 0;
page = pte_page(pte);
+ /*
+ * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+ * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+ */
+ if (page == arena->scratch_page)
+ return 0;
/*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -286,9 +335,10 @@ static void arena_map_free(struct bpf_map *map)
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
+ __free_page(arena->scratch_page);
bpf_map_area_free(arena);
}
@@ -374,33 +424,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
- if (page)
+ if (page) {
+ if (page == arena->scratch_page)
+ /* BPF triggered scratch here; don't lazy-alloc over it */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
+ }
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
free_pages_nolock(page, 0);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
flush_vmap_cache(kaddr, PAGE_SIZE);
bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -409,8 +463,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
return VM_FAULT_SIGSEGV;
}
@@ -668,6 +723,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
+ struct clear_range_data cdata;
unsigned long flags;
int ret = 0;
@@ -696,9 +752,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -788,6 +846,7 @@ static void arena_free_worker(struct work_struct *work)
struct arena_free_span *s;
u64 arena_vm_start, user_vm_start;
struct llist_head free_pages;
+ struct clear_range_data cdata;
struct page *page;
unsigned long full_uaddr;
long kaddr, page_cnt, pgoff;
@@ -801,6 +860,8 @@ static void arena_free_worker(struct work_struct *work)
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
@@ -813,7 +874,7 @@ static void arena_free_worker(struct work_struct *work)
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
@@ -928,23 +989,12 @@ static int __init kfunc_init(void)
}
late_initcall(kfunc_init);
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+ unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
- struct bpf_prog *prog;
u64 user_vm_start;
- /*
- * The RCU read lock is held to safely traverse the latch tree, but we
- * don't need its protection when accessing the prog, since it will not
- * disappear while we are handling the fault.
- */
- rcu_read_lock();
- prog = bpf_prog_ksym_find(fault_ip);
- rcu_read_unlock();
- if (!prog)
- return;
-
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
@@ -957,3 +1007,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
bpf_stream_dump_stack(ss);
}));
}
+
+/**
+ * bpf_arena_handle_page_fault - recover a kernel fault that hit a BPF arena
+ * @addr: faulting address; only the page it belongs to is used
+ * @is_write: access direction, passed on to the violation report
+ * @fault_ip: faulting instruction pointer, passed on to the violation report
+ *
+ * Takes the prog returned by bpf_prog_find_from_stack() and, if the faulting
+ * page lies inside the recoverable kernel range of that prog's arena,
+ * installs the arena's scratch page at that page through
+ * apply_range_set_scratch_cb(), flushes the vmap cache and reports the
+ * violation with the arena-relative page offset.
+ *
+ * NOTE(review): the apply_to_page_range() result is not checked, so true is
+ * returned even if the install failed - confirm this is intended.
+ *
+ * Return: true if the fault was attributed to an arena and handled here,
+ * false if no prog or arena was found or the address is outside the range.
+ */
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+	unsigned long fault_page = addr & PAGE_MASK;
+	unsigned long base;
+	struct bpf_arena *arena;
+	struct bpf_prog *prog;
+	u64 limit;
+
+	prog = bpf_prog_find_from_stack();
+	/* the prog found on the stack may not use an arena, hence the NULL check */
+	arena = prog ? prog->aux->arena : NULL;
+	if (!arena)
+		return false;
+
+	base = bpf_arena_get_kern_vm_start(arena);
+
+	/*
+	 * Only the 4 GiB mappable band and the upper half-guard are
+	 * recoverable. kfuncs cannot reach the lower guard, so a fault there
+	 * is some other kind of bug and is left to the normal oops path.
+	 */
+	limit = base + SZ_4G + GUARD_SZ / 2;
+	if (fault_page < base || fault_page >= limit)
+		return false;
+
+	apply_to_page_range(&init_mm, fault_page, PAGE_SIZE,
+			    apply_range_set_scratch_cb, arena->scratch_page);
+	flush_vmap_cache(fault_page, PAGE_SIZE);
+	__bpf_prog_report_arena_violation(prog, is_write, fault_page - base, fault_ip);
+	return true;
+}
+
+/*
+ * Report an arena access violation against the prog that
+ * bpf_prog_ksym_find() resolves @fault_ip to. Does nothing when the lookup
+ * finds no prog.
+ */
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_prog *owner;
+
+	/*
+	 * RCU is only needed to walk the latch tree safely. The prog cannot
+	 * disappear while its own fault is being handled, so it is used
+	 * after the read-side section ends.
+	 */
+	rcu_read_lock();
+	owner = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+
+	if (owner)
+		__bpf_prog_report_arena_violation(owner, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..fa368d8920d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
{
return 0;
}
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+ unsigned long fault_ip)
+{
+ return false; /* weak default: every fault is reported as not handled */
+}
#ifdef CONFIG_BPF_SYSCALL
static int __init bpf_global_ma_init(void)
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
2026-05-17 21:12 ` [PATCH 1/8] mm: Add ptep_try_install() for lockless empty-slot installs Tejun Heo
2026-05-17 21:12 ` [PATCH 2/8] bpf: Recover arena kernel faults with scratch page Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-17 21:12 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
` (4 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
only - it's used by the verifier to inline the kfunc when the call site is
non-sleepable. There is no sleepable equivalent for kernel callers; the
kfunc bpf_arena_alloc_pages itself is BPF-only.
sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
wrapper but passing sleepable=true to arena_alloc_pages().
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 8 ++++++++
kernel/bpf/arena.c | 13 +++++++++++++
2 files changed, 21 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 831996c411cf..64968ca6db51 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
u64 flags);
void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+ u64 flags);
#else
static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
int node_id, u64 flags)
@@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
{
}
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+ int node_id, u64 flags)
+{
+ return NULL; /* stub on the #else side: no pages are ever allocated */
+}
#endif
extern const struct bpf_map_ops bpf_map_offload_ops;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index bc696bef7104..8eb7b95f4999 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -937,6 +937,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
}
+
+/**
+ * bpf_arena_alloc_pages_sleepable - arena page allocation for sleepable kernel callers
+ * @p__map: the &struct bpf_map of a BPF_MAP_TYPE_ARENA map
+ * @addr__ign: requested address, passed through to arena_alloc_pages()
+ * @page_cnt: number of pages; must be non-zero
+ * @node_id: passed through to arena_alloc_pages()
+ * @flags: must be zero
+ *
+ * Same as bpf_arena_alloc_pages_non_sleepable() except that
+ * arena_alloc_pages() is called with sleepable set to true.
+ *
+ * Return: the arena_alloc_pages() result cast to a pointer, or NULL when the
+ * map is not an arena, @flags is non-zero or @page_cnt is zero.
+ */
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+				      int node_id, u64 flags)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena;
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA)
+		return NULL;
+	if (flags || !page_cnt)
+		return NULL;
+
+	arena = container_of(map, struct bpf_arena, map);
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
__bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
{
struct bpf_map *map = p__map;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog()
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
` (2 preceding siblings ...)
2026-05-17 21:12 ` [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-17 21:12 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
` (3 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
Add a helper that walks the member progs of the struct_ops map
containing a given @kdata vmtable. struct_ops ->reg() callbacks (and
similar) sometimes need to inspect the loaded BPF programs, e.g. to
discover maps they reference via prog->aux->used_maps.
The implementation mirrors bpf_struct_ops_id(): container_of @kdata
to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog
for i in [0, funcs_cnt). Same access pattern, no new locking - by the
time ->reg() fires st_map is fully populated and stable.
A sched_ext follow-up walks the member progs of a cid-form scheduler's
struct_ops map, reads prog->aux->arena directly, and requires all member
progs to reference exactly one arena, without requiring the BPF program
to call a registration kfunc.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 3 +++
kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 64968ca6db51..5b99d786e98c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2129,6 +2129,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+ int (*cb)(struct bpf_prog *prog, void *data),
+ void *data);
#ifdef CONFIG_NET
/* Define it here to avoid the use of forward declaration */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 05b366b821c3..16aec18ed31b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1203,6 +1203,42 @@ u32 bpf_struct_ops_id(const void *kdata)
}
EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
+/**
+ * bpf_struct_ops_for_each_prog - call @cb on every member prog of a struct_ops map
+ * @kdata: kernel-side struct_ops vmtable, as handed to ->reg/->update/->unreg
+ * @cb: called once for each populated member slot; a non-zero return ends the walk
+ * @data: opaque cookie forwarded to @cb
+ *
+ * @kdata is converted back to its enclosing struct_ops map with
+ * container_of() and the map's links are visited in index order. Empty
+ * slots are skipped. Meant for ->reg() style callbacks that need to look at
+ * the loaded progs, e.g. to find the maps listed in @prog->aux->used_maps.
+ *
+ * Return: 0 when every slot was visited, otherwise the first non-zero value
+ * returned by @cb.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data)
+{
+	struct bpf_struct_ops_value *kvalue =
+		container_of(kdata, struct bpf_struct_ops_value, data);
+	struct bpf_struct_ops_map *st_map =
+		container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+	u32 slot;
+
+	for (slot = 0; slot < st_map->funcs_cnt; slot++) {
+		int ret;
+
+		if (st_map->links[slot]) {
+			ret = cb(st_map->links[slot]->prog, data);
+			if (ret)
+				return ret;
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
` (3 preceding siblings ...)
2026-05-17 21:12 ` [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog() Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-17 21:12 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
` (2 subsequent siblings)
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
struct bpf_arena is opaque to callers outside arena.c. Add two helpers
for struct_ops subsystems that need to reach into an arena:
bpf_arena_map_kern_vm_start(struct bpf_map *map)
returns @map's kern_vm_start. A sched_ext follow-up needs this
to translate kern_va <-> uaddr.
bpf_prog_arena(struct bpf_prog *prog)
returns the bpf_map of the arena referenced by @prog (NULL if
@prog references no arena). The verifier enforces at most one
arena per program. Used by struct_ops callers that auto-discover
an arena from a member prog and need to take a map reference.
Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
include/linux/bpf.h | 2 ++
kernel/bpf/arena.c | 26 ++++++++++++++++++++++++++
2 files changed, 28 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b99d786e98c..e1ba57c10aaa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
struct bpf_spin_lock *spin_lock);
u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 8eb7b95f4999..5a82d39e1916 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
return arena ? arena->user_vm_start : 0;
}
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start of the arena behind a bpf_map
+ * @map: the &struct bpf_map embedded in a BPF_MAP_TYPE_ARENA map
+ *
+ * @map is converted with container_of() without looking at its map_type, so
+ * the caller must pass an arena map.
+ *
+ * Return: what bpf_arena_get_kern_vm_start() reports for that arena.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	return bpf_arena_get_kern_vm_start(arena);
+}
+
+/**
+ * bpf_prog_arena - bpf_map of the arena a prog references
+ * @prog: a loaded BPF program
+ *
+ * A program can reference at most one arena (enforced by the verifier),
+ * which is recorded in prog->aux->arena.
+ *
+ * Return: the &struct bpf_map embedded in that arena, or NULL when
+ * prog->aux->arena is not set.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+	struct bpf_arena *a = prog->aux->arena;
+
+	if (!a)
+		return NULL;
+	return &a->map;
+}
+
static long arena_map_peek_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
` (4 preceding siblings ...)
2026-05-17 21:12 ` [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena() Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-17 21:12 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
Upcoming patches will let the kernel place arena-resident scratch shared
with the BPF program (e.g. per-CPU set_cmask cmask) so the BPF side can
dereference it directly via __arena pointers, replacing the current
cmask_copy_from_kernel() probe-read loop. That requires each cid-form
scheduler to expose its arena to the kernel. Kernel-side accesses are
recovered by the per-arena scratch-page mechanism.
bpf_scx_reg_cid() walks the struct_ops member progs via
bpf_struct_ops_for_each_prog() and reads each prog's arena via
bpf_prog_arena(). The verifier enforces one arena per program, so each
member prog contributes at most one arena. All non-NULL contributions must
match and at least one member prog must use an arena. The map ref is held on
scx_sched and dropped on sched destroy. cpu-form schedulers (bpf_scx_reg)
are unchanged - no arena requirement.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 56 ++++++++++++++++++++++++++++++++++++-
kernel/sched/ext_internal.h | 8 ++++++
2 files changed, 63 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 64f8a096f133..94aab7037329 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5009,6 +5009,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ if (sch->arena_map)
+ bpf_map_put(sch->arena_map);
kfree(sch);
}
@@ -6732,6 +6734,7 @@ struct scx_enable_cmd {
struct sched_ext_ops_cid *ops_cid;
};
bool is_cid_type;
+ struct bpf_map *arena_map; /* arena ref to transfer to sch */
int ret;
};
@@ -6898,6 +6901,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
return ERR_PTR(ret);
}
#endif /* CONFIG_EXT_SUB_SCHED */
+
+ /*
+ * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+ * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+ * drops the ref. After this point, sch owns the ref and any cleanup
+ * runs through scx_sched_free_rcu_work() which puts it.
+ */
+ sch->arena_map = cmd->arena_map;
+ cmd->arena_map = NULL;
return sch;
#ifdef CONFIG_EXT_SUB_SCHED
@@ -7884,11 +7896,53 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
return scx_enable(&cmd, link);
}
+struct scx_arena_scan {
+ struct bpf_map *arena; /* arena map found so far; NULL until a member prog has one */
+ int err; /* set to -EINVAL when two member progs reference different arenas */
+};
+
+/*
+ * bpf_struct_ops_for_each_prog() callback that collects the arena used by
+ * the member progs into a struct scx_arena_scan.
+ *
+ * A prog references at most one arena (verifier-enforced), so each call
+ * sees either no arena or exactly one. Progs without an arena are ignored.
+ * The first arena seen is remembered; a later prog with a different arena
+ * sets ->err to -EINVAL and returns 1 to stop the walk.
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	struct scx_arena_scan *scan = data;
+	struct bpf_map *found = bpf_prog_arena(prog);
+
+	if (!found)
+		return 0;
+
+	if (!scan->arena || scan->arena == found) {
+		scan->arena = found;
+		return 0;
+	}
+
+	scan->err = -EINVAL;
+	return 1;
+}
+
static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
{
struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+ struct scx_arena_scan scan = {};
+ int ret;
- return scx_enable(&cmd, link);
+ bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+ if (scan.err) {
+ pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+ return scan.err;
+ }
+ if (!scan.arena) {
+ pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+ return -EINVAL;
+ }
+
+ bpf_map_inc(scan.arena);
+ cmd.arena_map = scan.arena;
+ ret = scx_enable(&cmd, link);
+ if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */
+ bpf_map_put(cmd.arena_map);
+ return ret;
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 7258aea94b9f..d40cfd29ddaa 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1111,6 +1111,14 @@ struct scx_sched {
struct sched_ext_ops_cid ops_cid;
};
bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */
+
+ /*
+ * Arena map auto-discovered from member progs at struct_ops attach.
+ * cid-form schedulers must use exactly one arena across all member
+ * progs. NULL on cpu-form.
+ */
+ struct bpf_map *arena_map;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
` (5 preceding siblings ...)
2026-05-17 21:12 ` [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
2026-05-18 7:20 ` Peter Zijlstra
2026-05-17 21:12 ` [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask Tejun Heo
7 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.
scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
kernel VA. On exhaustion, the pool grows by claiming more pages via
bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
mapping address; callers translate to the BPF-arena form themselves if
needed.
Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.
scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/build_policy.c | 4 ++
kernel/sched/ext.c | 11 ++++
kernel/sched/ext_arena.c | 127 ++++++++++++++++++++++++++++++++++++
kernel/sched/ext_arena.h | 18 +++++
kernel/sched/ext_internal.h | 5 ++
5 files changed, 165 insertions(+)
create mode 100644 kernel/sched/ext_arena.c
create mode 100644 kernel/sched/ext_arena.h
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 5e76c9177d54..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -59,12 +59,16 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
# include "ext_types.h"
# include "ext_internal.h"
# include "ext_cid.h"
+# include "ext_arena.h"
# include "ext_idle.h"
# include "ext.c"
# include "ext_cid.c"
+# include "ext_arena.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94aab7037329..3025fbe198d3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5009,6 +5009,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_arena_pool_destroy(sch);
if (sch->arena_map)
bpf_map_put(sch->arena_map);
kfree(sch);
@@ -7140,6 +7141,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7458,6 +7465,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
+ ret = scx_arena_pool_init(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..53174033765d
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+ SCX_ARENA_MIN_ORDER = 3, /* gen_pool min_alloc_order: 2^3 = 8-byte minimum sub-allocation */
+ SCX_ARENA_GROW_PAGES = 4, /* minimum number of arena pages claimed per pool growth */
+};
+
+/*
+ * Create the arena sub-allocator for @sch. A scheduler without an arena map
+ * gets no pool, which is not an error.
+ *
+ * Returns 0 on success (including the no-arena case) and -ENOMEM when
+ * gen_pool_create() fails.
+ */
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+	struct gen_pool *pool;
+
+	if (!sch->arena_map)
+		return 0;
+
+	pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+	sch->arena_pool = pool;
+	return pool ? 0 : -ENOMEM;
+}
+
+/*
+ * gen_pool_for_each_chunk() callback: hand every still-allocated range of
+ * @chunk back to @pool. Each run of set bits in the chunk bitmap is turned
+ * into an address and length (one bit covers 1 << min_alloc_order bytes)
+ * and released with gen_pool_free(). @data is unused.
+ */
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+				  void *data)
+{
+	int order = pool->min_alloc_order;
+	unsigned long nr_bits = (chunk->end_addr - chunk->start_addr + 1) >> order;
+	unsigned long first, past;
+
+	for_each_set_bitrange(first, past, chunk->bits, nr_bits) {
+		unsigned long addr = chunk->start_addr + (first << order);
+		size_t len = (past - first) << order;
+
+		gen_pool_free(pool, addr, len);
+	}
+}
+
+/*
+ * Destroy @sch's arena pool, if it has one. Ranges that are still allocated
+ * are first returned through scx_arena_clear_chunk() so that
+ * gen_pool_destroy() doesn't BUG on them. The arena pages backing the pool
+ * are not freed here; they go away when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+	struct gen_pool *pool = sch->arena_pool;
+
+	if (pool) {
+		gen_pool_for_each_chunk(pool, scx_arena_clear_chunk, NULL);
+		gen_pool_destroy(pool);
+		sch->arena_pool = NULL;
+	}
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ *
+ * The new pages are added to the pool as one chunk whose address is the
+ * arena's kern_vm_start plus the low 32 bits of the pointer returned by the
+ * arena allocator.
+ *
+ * Returns 0 on success, -EINVAL if @sch has no arena map or no pool, -ENOMEM
+ * if the arena allocation returned NULL, or the gen_pool_add() error, in
+ * which case the just-allocated pages are given back to the arena.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+ u64 kern_vm_start;
+ u32 uaddr32;
+ void *p;
+ int ret;
+
+ if (!sch->arena_map || !sch->arena_pool)
+ return -EINVAL;
+
+ p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+ page_cnt, NUMA_NO_NODE, 0);
+ if (!p)
+ return -ENOMEM;
+
+ uaddr32 = (u32)(unsigned long)p; /* keep only the low 32 bits of the returned address */
+ kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+
+ ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+ page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+ if (ret) {
+ /* the chunk was not registered, so undo the arena allocation */
+ bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns the kernel VA on success
+ * and NULL on failure, when @sch has no pool, or when @size is zero.
+ *
+ * If the pool cannot satisfy the request it is grown once through
+ * scx_arena_grow(), by enough pages to hold @size and by at least
+ * SCX_ARENA_GROW_PAGES, and the allocation is retried. Growing sleeps, so
+ * the caller must be in a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+	unsigned long kern_va;
+	u32 page_cnt;
+
+	might_sleep();
+
+	/*
+	 * gen_pool_alloc() returns 0 for a zero-sized request. Without this
+	 * check that would be taken for pool exhaustion below and every such
+	 * call would claim more arena pages only to fail again.
+	 */
+	if (!sch->arena_pool || !size)
+		return NULL;
+
+	kern_va = gen_pool_alloc(sch->arena_pool, size);
+	if (kern_va)
+		return (void *)kern_va;
+
+	/* pool exhausted: claim more arena pages, then retry exactly once */
+	page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+			 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+	if (scx_arena_grow(sch, page_cnt))
+		return NULL;
+
+	kern_va = gen_pool_alloc(sch->arena_pool, size);
+	if (!kern_va)
+		return NULL;
+
+	return (void *)kern_va;
+}
+
+/*
+ * Give @size bytes at @kern_va back to @sch's arena pool. A NULL @kern_va or
+ * a scheduler without a pool makes this a no-op.
+ */
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+	if (!sch->arena_pool || !kern_va)
+		return;
+
+	gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index d40cfd29ddaa..ff7e882bd67a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1116,8 +1116,13 @@ struct scx_sched {
* Arena map auto-discovered from member progs at struct_ops attach.
* cid-form schedulers must use exactly one arena across all member
* progs. NULL on cpu-form.
+ *
+ * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+ * at the kernel-side mapping address. Grows on demand and pages are
+ * not released until sched destroy.
*/
struct bpf_map *arena_map;
+ struct gen_pool *arena_pool;
DECLARE_BITMAP(has_op, SCX_OPI_END);
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
2026-05-17 21:12 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
@ 2026-05-18 7:20 ` Peter Zijlstra
2026-05-18 19:51 ` Tejun Heo
0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2026-05-18 7:20 UTC (permalink / raw)
To: Tejun Heo
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
sched-ext, bpf, x86, linux-arm-kernel, linux-mm, linux-kernel
On Sun, May 17, 2026 at 11:12:31AM -1000, Tejun Heo wrote:
> Build a per-scheduler sub-allocator on top of pages claimed from the BPF
> arena registered in the previous patch. Subsequent kernel-managed
> arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
> from this pool.
>
> scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
> kernel VA. On exhaustion, the pool grows by claiming more pages via
> bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
> mapping address; callers translate to the BPF-arena form themselves if
> needed.
>
> Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
> arena page allocation. All current consumers run from the enable path (after
> ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
> where sleeping is fine.
>
> scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
> gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
> underlying arena pages are released when the arena map itself is torn down,
> so the pool destroy doesn't free them explicitly.
Should this really be part of scx rather than be part of the bpf-arena
thing proper?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
2026-05-18 7:20 ` Peter Zijlstra
@ 2026-05-18 19:51 ` Tejun Heo
2026-05-18 23:26 ` Alexei Starovoitov
0 siblings, 1 reply; 18+ messages in thread
From: Tejun Heo @ 2026-05-18 19:51 UTC (permalink / raw)
To: Peter Zijlstra
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
sched-ext, bpf, x86, linux-arm-kernel, linux-mm, linux-kernel
Hello,
On Mon, May 18, 2026 at 09:20:42AM +0200, Peter Zijlstra wrote:
...
> Should this really be part of scx rather than be part of the bpf-arena
> thing proper?
It's just a layer on top of arena. If bpf folks are okay with it, I don't
see why it can't be a common utility thing on the bpf side.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
2026-05-18 19:51 ` Tejun Heo
@ 2026-05-18 23:26 ` Alexei Starovoitov
0 siblings, 0 replies; 18+ messages in thread
From: Alexei Starovoitov @ 2026-05-18 23:26 UTC (permalink / raw)
To: Tejun Heo, Peter Zijlstra
Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
Andrew Morton, David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
sched-ext, bpf, x86, linux-arm-kernel, linux-mm, linux-kernel
On Mon May 18, 2026 at 12:51 PM PDT, Tejun Heo wrote:
> Hello,
>
> On Mon, May 18, 2026 at 09:20:42AM +0200, Peter Zijlstra wrote:
> ...
>> Should this really be part of scx rather than be part of the bpf-arena
>> thing proper?
>
> It's just a layer on top of arena. If bpf folks are okay with it, I don't
> see why it can't be a common utility thing on the bpf side.
Well, this gen_pool based allocator of arena memory is a temporary hack.
It's ok for rare allocations like this one at scx init time, but not suitable
for active arena management. We don't need to expose it beyond scx.
Having said that, a fast and generic allocator for arena is definitely needed.
This breakthrough with scratch page and bpf_arena_handle_page_fault()
cannot be overstated. It is a huge improvement for kernel <-> bpf interaction.
Not only can kfuncs now read arena without ugly __get_kernel_nofault(),
but we can reuse mm/slub.c to manage arena memory!
The key idea is simply this:
get_freepointer() {
if (s->flags & SLAB_BPF_ARENA)
return (void *)(s->arena_kern_vm_start | (u32)(unsigned long)ptr);
}
I'm sloping a prototype.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
2026-05-17 21:12 [PATCHSET v2 sched_ext/for-7.2] bpf/arena: Direct kernel-side access Tejun Heo
` (6 preceding siblings ...)
2026-05-17 21:12 ` [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages Tejun Heo
@ 2026-05-17 21:12 ` Tejun Heo
7 siblings, 0 replies; 18+ messages in thread
From: Tejun Heo @ 2026-05-17 21:12 UTC (permalink / raw)
To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Kumar Kartikeya Dwivedi
Cc: Catalin Marinas, Will Deacon, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, Andrew Morton, David Hildenbrand,
Mike Rapoport, Emil Tsalapatis, sched-ext, bpf, x86,
linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.
With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/sched/ext.c | 68 +++++++++++++++++++++++++--
kernel/sched/ext_cid.c | 20 +-------
kernel/sched/ext_internal.h | 10 +++-
tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
tools/sched_ext/scx_qmap.bpf.c | 5 +-
5 files changed, 75 insertions(+), 80 deletions(-)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3025fbe198d3..1369dc7e4b4e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
update_locked_rq(rq);
if (scx_is_cid_type()) {
- struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
- lockdep_assert_irqs_disabled();
- scx_cpumask_to_cmask(cpumask, cmask);
- sch->ops_cid.set_cmask(task, cmask);
+ struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+ unsigned long uaddr = (unsigned long)kern_va -
+ bpf_arena_map_kern_vm_start(sch->arena_map);
+ /*
+ * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+ * holds the rq lock with IRQs disabled, which makes us the sole
+ * user of the scratch area.
+ */
+ scx_cpumask_to_cmask(cpumask, kern_va);
+ sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
} else {
sch->ops.set_cpumask(task, cpumask);
}
@@ -4957,6 +4962,48 @@ static const struct attribute_group scx_global_attr_group = {
static void free_pnode(struct scx_sched_pnode *pnode);
static void free_exit_info(struct scx_exit_info *ei);
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) /* set up one arena cmask per possible CPU for @sch; 0 on success, -ENOMEM on failure */
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus())); /* struct plus a bits[] array sized for num_possible_cpus() */
+ int cpu;
+
+ if (!sch->is_cid_type || !sch->arena_pool) /* is_cid_type unset or no arena pool: nothing to allocate, report success */
+ return 0;
+
+ sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); /* one pointer slot per CPU */
+ if (!sch->set_cmask_scratch)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ *slot = scx_arena_alloc(sch, size); /* slot keeps the kernel-side address of the cmask */
+ if (!*slot)
+ return -ENOMEM; /* earlier slots are left populated; NOTE(review): confirm the caller's error path ends in scx_set_cmask_scratch_free() */
+ scx_cmask_init(*slot, 0, num_possible_cpus());
+ }
+ return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch) /* release what scx_set_cmask_scratch_alloc() set up for @sch */
+{
+ size_t size = struct_size_t(struct scx_cmask, bits,
+ SCX_CMASK_NR_WORDS(num_possible_cpus())); /* same size expression as the alloc side, as scx_arena_free() takes the size */
+ int cpu;
+
+ if (!sch->set_cmask_scratch) /* per-CPU slots were never allocated, or were already freed */
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+ scx_arena_free(sch, *slot, size); /* scx_arena_free() ignores a NULL *slot */
+ }
+ free_percpu(sch->set_cmask_scratch);
+ sch->set_cmask_scratch = NULL; /* lets a repeated call hit the early return above */
+}
+
static void scx_sched_free_rcu_work(struct work_struct *work)
{
struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5009,6 +5056,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
free_exit_info(sch->exit_info);
+ scx_set_cmask_scratch_free(sch);
scx_arena_pool_destroy(sch);
if (sch->arena_map)
bpf_map_put(sch->arena_map);
@@ -7147,6 +7195,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
goto err_disable;
}
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret) {
+ cpus_read_unlock();
+ goto err_disable;
+ }
+
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
set_bit(i, sch->has_op);
@@ -7469,6 +7523,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
if (ret)
goto err_disable;
+ ret = scx_set_cmask_scratch_alloc(sch);
+ if (ret)
+ goto err_disable;
+
if (validate_ops(sch, ops))
goto err_disable;
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 5cd14143f88f..245a39e2e5eb 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
*/
#include <linux/cacheinfo.h>
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* cid tables.
*
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
u32 npossible = num_possible_cpus();
s16 *cid_to_cpu, *cpu_to_cid;
struct scx_cid_topo *cid_topo;
- struct scx_cmask __percpu *set_cmask_scratch;
- s32 cpu;
if (scx_cid_to_cpu_tbl)
return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
- set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
- SCX_CMASK_NR_WORDS(npossible)),
- sizeof(u64));
- if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+ if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
kfree(cid_to_cpu);
kfree(cpu_to_cid);
kfree(cid_topo);
- free_percpu(set_cmask_scratch);
return -ENOMEM;
}
WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
WRITE_ONCE(scx_cid_topo, cid_topo);
- for_each_possible_cpu(cpu)
- scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
- 0, npossible);
- WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
return 0;
}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
struct bpf_map *arena_map;
struct gen_pool *arena_pool;
+ /*
+ * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+ * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+ * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+ * arena's kern_vm_start.
+ */
+ struct scx_cmask * __percpu *set_cmask_scratch;
+
DECLARE_BITMAP(has_op, SCX_OPI_END);
/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
extern struct scx_sched __rcu *scx_root;
DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
/*
* True when the currently loaded scheduler hierarchy is cid-form. All scheds
* in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index 257d8bdca966..875003f04bdc 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -669,56 +669,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
}
}
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
- const struct scx_cmask *src)
-{
- u32 base = 0, nr_cids = 0, nr_words, wi;
-
- if (dst->base != 0) {
- scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
- return;
- }
-
- if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
- scx_bpf_error("probe-read cmask->base failed");
- return;
- }
- if (base != 0) {
- scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
- return;
- }
-
- if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
- scx_bpf_error("probe-read cmask->nr_cids failed");
- return;
- }
-
- if (nr_cids > dst->nr_cids) {
- scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
- nr_cids, dst->nr_cids);
- return;
- }
-
- nr_words = CMASK_NR_WORDS(nr_cids);
- cmask_zero(dst);
- bpf_for(wi, 0, CMASK_MAX_WORDS) {
- u64 word = 0;
- if (wi >= nr_words)
- break;
- if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
- scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
- return;
- }
- dst->bits[wi] = word;
- }
-}
-
#endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
}
void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
- const struct scx_cmask *cmask)
+ const struct scx_cmask *cmask_in)
{
+ struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
task_ctx_t *taskc;
taskc = lookup_task_ctx(p);
if (!taskc)
return;
- cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+ cmask_copy(&taskc->cpus_allowed, cmask);
}
struct monitor_timer {
--
2.54.0
^ permalink raw reply related [flat|nested] 18+ messages in thread