[PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages

BPF List
 help / color / mirror / Atom feed

* [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
@ 2026-06-01 18:37 Tejun Heo
  2026-06-01 18:58 ` sashiko-bot
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Tejun Heo @ 2026-06-01 18:37 UTC (permalink / raw)
  To: void, arighi, changwoo, ast, andrii, daniel, martin.lau, memxor
  Cc: peterz, catalin.marinas, will, tglx, mingo, bp, dave.hansen, akpm,
	david, rppt, emil, sched-ext, bpf, x86, linux-arm-kernel,
	linux-mm, linux-kernel, Tejun Heo

apply_range_set_cb() maps the pages for a new arena allocation and returned
-EBUSY when the target PTE was already populated. Kernel-fault recovery
leaves the per-arena scratch page in unallocated arena PTEs, so a later
bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every
subsequent allocation of it fails the same way. Allocation must install the
real page over scratch instead.

Overwriting the scratch PTE in place is a valid->valid change, which arm64
forbids without break-before-make. Route through an invalid entry instead:
ptep_try_set() fills only a none slot, so the PTE goes scratch->none->page.
On finding scratch, clear it and flush_tlb_before_set() before retrying. The
new flush_tlb_before_set() is a no-op except on arches like arm64 that need
the break-before-make TLB invalidate. The loop also copes with a concurrent
fault re-scratching the slot.

Arches without ptep_try_set() never install the scratch page, so keep the
must-be-empty check and set_pte_at() for them.

Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page")
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---

 arch/arm64/include/asm/pgtable.h |   11 +++++++++++
 include/linux/pgtable.h          |   18 ++++++++++++++++++
 kernel/bpf/arena.c               |   38 +++++++++++++++++++++++++++++++++-----
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 984f050..3ce0f2a 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1842,6 +1842,17 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
 }
 #define ptep_try_set ptep_try_set
 
+/*
+ * arm64 mandates break-before-make: a cleared kernel PTE must have its TLB
+ * invalidated before a different page is installed in its place. The broadcast
+ * TLBI is an instruction, not an IPI, so this is safe with interrupts disabled.
+ */
+static inline void flush_tlb_before_set(unsigned long addr)
+{
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+}
+#define flush_tlb_before_set flush_tlb_before_set
+
 #define test_and_clear_young_ptes test_and_clear_young_ptes
 static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *ptep, unsigned int nr)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index b5739bb..4c6c408 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1061,6 +1061,24 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
 }
 #endif
 
+#ifndef flush_tlb_before_set
+/**
+ * flush_tlb_before_set - invalidate a kernel PTE's TLB before re-setting it
+ * @addr: kernel virtual address whose PTE was just cleared
+ *
+ * Some architectures (e.g. arm64) do not allow a live page-table entry to be
+ * repointed at a different page in one step. The old entry must first be made
+ * invalid and its translation flushed from every TLB, and only then may the new
+ * entry be written.
+ *
+ * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()).
+ * It must be callable with interrupts disabled.
+ */
+static inline void flush_tlb_before_set(unsigned long addr)
+{
+}
+#endif
+
 #ifndef wrprotect_ptes
 /**
  * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1727503..b6ac5a9 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -142,6 +142,7 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
 
 struct apply_range_data {
 	struct page **pages;
+	struct page *scratch_page;
 	int i;
 };
 
@@ -154,19 +155,44 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct apply_range_data *d = data;
 	struct page *page;
+	pte_t pteval;
 
 	if (!data)
 		return 0;
-	/* sanity check */
-	if (unlikely(!pte_none(ptep_get(pte))))
-		return -EBUSY;
 
 	page = d->pages[d->i];
 	/* paranoia, similar to vmap_pages_pte_range() */
 	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
 		return -EINVAL;
 
-	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+	pteval = mk_pte(page, PAGE_KERNEL);
+#ifdef ptep_try_set
+	/*
+	 * Kernel-fault recovery may have installed the scratch page here, and
+	 * some architectures (arm64) prohibit valid->valid PTE transitions.
+	 * Install atomically into a none slot. If scratch is present, clear it
+	 * and flush_tlb_before_set() (break-before-make) before retrying.
+	 */
+	while (!ptep_try_set(pte, pteval)) {
+		pte_t old = ptep_get(pte);
+
+		if (pte_none(old))
+			continue;
+		if (WARN_ON_ONCE(pte_page(old) != d->scratch_page))
+			return -EBUSY;
+		ptep_get_and_clear(&init_mm, addr, pte);
+		flush_tlb_before_set(addr);
+	}
+#else
+	/*
+	 * Without ptep_try_set() there is no atomic installer, but such arches
+	 * also do not wire up bpf_arena_handle_page_fault(), so no scratch page
+	 * is ever installed and the slot is always none here.
+	 */
+	if (unlikely(!pte_none(ptep_get(pte))))
+		return -EBUSY;
+	set_pte_at(&init_mm, addr, pte, pteval);
+#endif
 	d->i++;
 	return 0;
 }
@@ -475,7 +501,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	if (ret)
 		goto out_sigsegv_memcg;
 
-	struct apply_range_data data = { .pages = &page, .i = 0 };
+	struct apply_range_data data = { .pages = &page, .i = 0,
+					 .scratch_page = arena->scratch_page };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
@@ -665,6 +692,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 		return 0;
 	}
 	data.pages = pages;
+	data.scratch_page = arena->scratch_page;
 
 	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
 		goto out_free_pages;

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
  2026-06-01 18:37 [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Tejun Heo
@ 2026-06-01 18:58 ` sashiko-bot
  2026-06-01 19:19   ` Tejun Heo
       [not found] ` <f45f02410d23c99d14f3577da9b27a327816b5a0904bdeefc8e229eac760d1cf@mail.kernel.org>
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: sashiko-bot @ 2026-06-01 18:58 UTC (permalink / raw)
  To: Tejun Heo; +Cc: sched-ext, bpf

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:

New issues:
- [Critical] Replacing the populated scratch page PTE with a newly allocated page lacks a TLB flush on architectures where `flush_tlb_before_set()` is a no-op, causing silent data corruption.
- [High] Calling `pte_page(old)` without checking `pte_present(old)` can cause an invalid pointer dereference or kernel panic.

Pre-existing issues:
- [High] Calling memory allocation and page table walking functions with interrupts disabled causes lock nesting violations and scheduling while atomic bugs.
--

commit b6f9fe34365955853d165d4a39abcf4bf12f7b70
Author: Tejun Heo <tj@kernel.org>

bpf: Replace scratch PTE atomically when allocating arena pages

This commit modifies the BPF arena allocation to replace the shared scratch
page with a newly allocated page. To support architectures like arm64, it
uses ptep_try_set() to do a valid->invalid->valid transition, providing a
new flush_tlb_before_set() hook for the break-before-make sequence. For
architectures without ptep_try_set(), it falls back to checking pte_none()
and setting the PTE directly.

> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index b5739bb99fc15..4c6c4081ef715 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h

[ ... ]

> @@ -1061,6 +1061,24 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
>  }
>  #endif
>  
> +#ifndef flush_tlb_before_set
> +/**
> + * flush_tlb_before_set - invalidate a kernel PTE's TLB before re-setting it
> + * @addr: kernel virtual address whose PTE was just cleared
> + *
> + * Some architectures (e.g. arm64) do not allow a live page-table entry to be
> + * repointed at a different page in one step. The old entry must first be made
> + * invalid and its translation flushed from every TLB, and only then may the new
> + * entry be written.
> + *
> + * This is only for the lockless atomic kernel-PTE installers (ptep_try_set()).
> + * It must be callable with interrupts disabled.
> + */
> +static inline void flush_tlb_before_set(unsigned long addr)
> +{
> +}

[Severity: Critical]
Since the generic implementation of flush_tlb_before_set() is an empty
function, does this leave a stale TLB entry on architectures like x86?

When the scratch page is replaced, ptep_get_and_clear() clears the PTE, and
then ptep_try_set() installs the newly allocated page. If the architecture
uses the generic fallback for flush_tlb_before_set(), there is no TLB flush
before the new entry is written, and the allocation loop doesn't seem to
perform a deferred flush either.

Could this result in BPF programs continuing to read and write the shared
scratch page instead of the new memory?

> +#endif
> +
>  #ifndef wrprotect_ptes
>  /**
>   * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index a641b19255aad..799fb49768e69 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c

[ ... ]

> @@ -154,19 +155,44 @@ static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
>  {
>  	struct apply_range_data *d = data;
>  	struct page *page;
> +	pte_t pteval;
>  
>  	if (!data)
>  		return 0;
>  
>  	page = d->pages[d->i];
>  	/* paranoia, similar to vmap_pages_pte_range() */
>  	if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
>  		return -EINVAL;
>  
> -	set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
> +	pteval = mk_pte(page, PAGE_KERNEL);
> +#ifdef ptep_try_set
> +	/*
> +	 * Kernel-fault recovery may have installed the scratch page here, and
> +	 * some architectures (arm64) prohibit valid->valid PTE transitions.
> +	 * Install atomically into a none slot. If scratch is present, clear it
> +	 * and flush_tlb_before_set() (break-before-make) before retrying.
> +	 */
> +	while (!ptep_try_set(pte, pteval)) {
> +		pte_t old = ptep_get(pte);
> +
> +		if (pte_none(old))
> +			continue;
> +		if (WARN_ON_ONCE(pte_page(old) != d->scratch_page))

[Severity: High]
Can calling pte_page(old) here without first checking pte_present(old)
cause an invalid pointer dereference?

The function pte_none() only guarantees the PTE is not completely empty, but
it doesn't ensure it is present. If the PTE contains a non-present entry
like a swap entry or a poison marker, pte_page() will interpret the swap
metadata as a physical page frame number.

Additionally, on architectures like x86 with CONFIG_DEBUG_VM enabled,
pte_pfn() asserts pte_present(), which would trigger a kernel panic. Should
this check pte_present(old) before extracting the page?

> +			return -EBUSY;
> +		ptep_get_and_clear(&init_mm, addr, pte);
> +		flush_tlb_before_set(addr);
> +	}

[ ... ]

> @@ -475,7 +501,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  	if (ret)
>  		goto out_sigsegv_memcg;
>  
> -	struct apply_range_data data = { .pages = &page, .i = 0 };
> +	struct apply_range_data data = { .pages = &page, .i = 0,
> +					 .scratch_page = arena->scratch_page };
>  	/* Account into memcg of the process that created bpf_arena */
>  	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);

[Severity: High]
This is a pre-existing issue, but does calling bpf_map_alloc_pages() here
with interrupts disabled cause lock nesting violations or scheduling while
atomic bugs?

Looking at arena_vm_fault(), raw_res_spin_lock_irqsave() is called earlier,
disabling interrupts. Then bpf_map_alloc_pages() is called, followed by
apply_to_page_range(). The apply_to_page_range() function can dynamically
allocate page tables using GFP_KERNEL (which can sleep) and acquires the
page table lock, which is a sleepable rt_mutex on PREEMPT_RT kernels.

A similar pattern exists in arena_alloc_pages(), where
raw_res_spin_lock_irqsave() is taken before the while loop that calls
apply_to_page_range(). Is it safe to call these sleepable functions inside
an IRQ-disabled critical section?

>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>  		goto out_sigsegv_memcg;
>  	}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260601183728.1800490-1-tj@kernel.org?part=1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
  2026-06-01 18:58 ` sashiko-bot
@ 2026-06-01 19:19   ` Tejun Heo
  0 siblings, 0 replies; 13+ messages in thread
From: Tejun Heo @ 2026-06-01 19:19 UTC (permalink / raw)
  To: sashiko-bot; +Cc: sched-ext, bpf

Hello,

> [Critical] Replacing the populated scratch page PTE with a newly allocated
> page lacks a TLB flush on architectures where flush_tlb_before_set() is a
> no-op, causing silent data corruption.

By design. A scratch PTE only exists after a program has malfunctioned, and
the contract is just that the kernel does not die. Reading the scratch page
or the new page is equally fine.

> [High] Calling pte_page(old) without checking pte_present(old) can cause an
> invalid pointer dereference or kernel panic.

Same invariant the existing arena code relies on: these init_mm slots only
ever hold none or a present page, so pte_none() is enough.

> [High] Calling memory allocation and page table walking functions with
> interrupts disabled [...]

Pre-existing and unchanged here. Page tables are pre-populated, so
apply_to_page_range() does not allocate.

Thanks.
--
tejun

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
       [not found] ` <f45f02410d23c99d14f3577da9b27a327816b5a0904bdeefc8e229eac760d1cf@mail.kernel.org>
@ 2026-06-02 22:09   ` Tejun Heo
  2026-06-06 16:06     ` Catalin Marinas
  0 siblings, 1 reply; 13+ messages in thread
From: Tejun Heo @ 2026-06-02 22:09 UTC (permalink / raw)
  To: bot+bpf-ci, void, arighi, changwoo, ast, andrii, daniel,
	martin.lau, memxor
  Cc: peterz, catalin.marinas, will, tglx, mingo, bp, dave.hansen, akpm,
	david, rppt, emil, sched-ext, bpf, x86, linux-arm-kernel,
	linux-mm, linux-kernel, eddyz87, yonghong.song, clm,
	ihor.solodrai

On Mon, Jun 01, 2026 at 08:15:34PM +0000, bot+bpf-ci@kernel.org wrote:
> After the real page is installed without a flush, can that stale
> kaddr -> scratch_page translation persist, so that later kernel-side
> accesses at kaddr reach the shared per-arena scratch page instead of
> the freshly allocated page?

It can on x86, but it's harmless: that CPU faulted on an unallocated
address and got scratch-recovered, so reaching either the scratch or the
real page is fine. No flush needed.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
  2026-06-01 18:37 [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Tejun Heo
  2026-06-01 18:58 ` sashiko-bot
       [not found] ` <f45f02410d23c99d14f3577da9b27a327816b5a0904bdeefc8e229eac760d1cf@mail.kernel.org>
@ 2026-06-02 22:29 ` Kumar Kartikeya Dwivedi
  2026-06-05 15:30 ` patchwork-bot+netdevbpf
  3 siblings, 0 replies; 13+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-06-02 22:29 UTC (permalink / raw)
  To: Tejun Heo, void, arighi, changwoo, ast, andrii, daniel,
	martin.lau, memxor
  Cc: peterz, catalin.marinas, will, tglx, mingo, bp, dave.hansen, akpm,
	david, rppt, emil, sched-ext, bpf, x86, linux-arm-kernel,
	linux-mm, linux-kernel

On Mon Jun 1, 2026 at 8:37 PM CEST, Tejun Heo wrote:
> apply_range_set_cb() maps the pages for a new arena allocation and returned
> -EBUSY when the target PTE was already populated. Kernel-fault recovery
> leaves the per-arena scratch page in unallocated arena PTEs, so a later
> bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every
> subsequent allocation of it fails the same way. Allocation must install the
> real page over scratch instead.
>
> Overwriting the scratch PTE in place is a valid->valid change, which arm64
> forbids without break-before-make. Route through an invalid entry instead:
> ptep_try_set() fills only a none slot, so the PTE goes scratch->none->page.
> On finding scratch, clear it and flush_tlb_before_set() before retrying. The
> new flush_tlb_before_set() is a no-op except on arches like arm64 that need
> the break-before-make TLB invalidate. The loop also copes with a concurrent
> fault re-scratching the slot.
>
> Arches without ptep_try_set() never install the scratch page, so keep the
> must-be-empty check and set_pte_at() for them.
>
> Fixes: dc11a4dba246 ("bpf: Recover arena kernel faults with scratch page")
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: David Hildenbrand <david@kernel.org>
> ---
>

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
  2026-06-01 18:37 [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Tejun Heo
                   ` (2 preceding siblings ...)
  2026-06-02 22:29 ` [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Kumar Kartikeya Dwivedi
@ 2026-06-05 15:30 ` patchwork-bot+netdevbpf
  3 siblings, 0 replies; 13+ messages in thread
From: patchwork-bot+netdevbpf @ 2026-06-05 15:30 UTC (permalink / raw)
  To: Tejun Heo
  Cc: void, arighi, changwoo, ast, andrii, daniel, martin.lau, memxor,
	peterz, catalin.marinas, will, tglx, mingo, bp, dave.hansen, akpm,
	david, rppt, emil, sched-ext, bpf, x86, linux-arm-kernel,
	linux-mm, linux-kernel

Hello:

This patch was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Mon,  1 Jun 2026 08:37:28 -1000 you wrote:
> apply_range_set_cb() maps the pages for a new arena allocation and returned
> -EBUSY when the target PTE was already populated. Kernel-fault recovery
> leaves the per-arena scratch page in unallocated arena PTEs, so a later
> bpf_arena_alloc_pages() over such a page hits that -EBUSY, and every
> subsequent allocation of it fails the same way. Allocation must install the
> real page over scratch instead.
> 
> [...]

Here is the summary with links:
  - [bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
    https://git.kernel.org/bpf/bpf-next/c/f64c723741c9

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages
  2026-06-02 22:09   ` Tejun Heo
@ 2026-06-06 16:06     ` Catalin Marinas
  2026-06-07  7:59       ` [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set() Tejun Heo
  0 siblings, 1 reply; 13+ messages in thread
From: Catalin Marinas @ 2026-06-06 16:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: bot+bpf-ci, void, arighi, changwoo, ast, andrii, daniel,
	martin.lau, memxor, peterz, will, tglx, mingo, bp, dave.hansen,
	akpm, david, rppt, emil, sched-ext, bpf, x86, linux-arm-kernel,
	linux-mm, linux-kernel, eddyz87, yonghong.song, clm,
	ihor.solodrai

On Tue, Jun 02, 2026 at 12:09:11PM -1000, Tejun Heo wrote:
> On Mon, Jun 01, 2026 at 08:15:34PM +0000, bot+bpf-ci@kernel.org wrote:
> > After the real page is installed without a flush, can that stale
> > kaddr -> scratch_page translation persist, so that later kernel-side
> > accesses at kaddr reach the shared per-arena scratch page instead of
> > the freshly allocated page?
> 
> It can on x86, but it's harmless: that CPU faulted on an unallocated
> address and got scratch-recovered, so reaching either the scratch or the
> real page is fine. No flush needed.

I think for arm64 it will be slightly different. After making the pte
invalid, we flush the TLBs and a subsequent access will fault. However,
ptep_try_set() is missing __set_pte_complete() with the necessary
barriers. A subsequent access may fault rather than hit the old or the
new page. Something like below, as a fixup for 258df8fce42f ("mm: Add
ptep_try_set() for lockless empty-slot installs"):

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 3ce0f2a6cab6..dc8525431273 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1838,7 +1838,11 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
 {
 	pteval_t old = 0;
 
-	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
+		return false;
+
+	__set_pte_complete(new_pte);
+	return true;
 }
 #define ptep_try_set ptep_try_set
 

-- 
Catalin

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-06 16:06     ` Catalin Marinas
@ 2026-06-07  7:59       ` Tejun Heo
  2026-06-07  8:12         ` sashiko-bot
  2026-06-07  8:38         ` bot+bpf-ci
  0 siblings, 2 replies; 13+ messages in thread
From: Tejun Heo @ 2026-06-07  7:59 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Alexei Starovoitov
  Cc: David Hildenbrand, Andrea Righi, Kumar Kartikeya Dwivedi,
	Andrew Morton, Mike Rapoport, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Eduard Zingerman, Yonghong Song,
	Emil Tsalapatis, David Vernet, Changwoo Min, linux-arm-kernel,
	linux-mm, bpf, linux-kernel

ptep_try_set() installs the new entry with try_cmpxchg() but, unlike
__set_pte(), never calls __set_pte_complete(). On arm64, installing a valid
kernel PTE requires barriers afterward so a subsequent access observes it.
Without them the access can fault instead of reaching the freshly installed
page.

Call __set_pte_complete() after a successful cmpxchg, mirroring
__set_pte().

Fixes: 258df8fce42f ("mm: Add ptep_try_set() for lockless empty-slot installs")
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/all/aiRFcz78QTZdIHHB@arm.com/
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/arm64/include/asm/pgtable.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 3ce0f2a6cab6..dc8525431273 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1838,7 +1838,11 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
 {
 	pteval_t old = 0;
 
-	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
+		return false;
+
+	__set_pte_complete(new_pte);
+	return true;
 }
 #define ptep_try_set ptep_try_set
 
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-07  7:59       ` [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set() Tejun Heo
@ 2026-06-07  8:12         ` sashiko-bot
  2026-06-07  8:38         ` bot+bpf-ci
  1 sibling, 0 replies; 13+ messages in thread
From: sashiko-bot @ 2026-06-07  8:12 UTC (permalink / raw)
  To: Tejun Heo; +Cc: bpf

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] The patch uses `__set_pte_complete()` in `ptep_try_set()`, which incorrectly defers architectural PTE barriers if a synchronous BPF fault interrupts a lazy MMU block, leading to an infinite page fault livelock.
--

commit 3da2cf1409cd9f964e0879e7b2ee1aaae71958ab
Author: Tejun Heo <tj@kernel.org>

arm64: mm: Complete the PTE store in ptep_try_set()

This commit updates ptep_try_set() to call __set_pte_complete() after
successfully setting a new PTE via try_cmpxchg(). This mirrors the behavior
in __set_pte() to ensure architectural barriers are issued for valid kernel
PTEs on arm64.

Link: https://lore.kernel.org/all/aiRFcz78QTZdIHHB@arm.com/

> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 984f0502c9d0f..9f5d75e736919 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1838,7 +1838,11 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
>  {
>  	pteval_t old = 0;
>  
> -	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
> +	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
> +		return false;
> +
> +	__set_pte_complete(new_pte);

[Severity: High]
Does calling __set_pte_complete() here risk deferring required architectural
barriers during synchronous BPF arena page faults like
bpf_arena_kernel_fault()?

If a BPF program triggers while the interrupted task is inside a lazy MMU
block (like zap_pte_range()), the kprobe runs as a synchronous debug
exception, so in_interrupt() is false.

__set_pte_complete() delegates to queue_pte_barriers(), which defers the
hardware barriers by setting TIF_LAZY_MMU_PENDING if
is_lazy_mmu_mode_active() returns true. Since in_interrupt() is false,
is_lazy_mmu_mode_active() will evaluate to true.

Because the required dsb is deferred, the fault handler can return without
the memory system observing the new PTE. The CPU then retries the faulting
instruction, which faults again, resulting in an infinite page fault loop.

> +	return true;
>  }
>  #define ptep_try_set ptep_try_set

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/088f52fd25860ca961449d53f91b214a@kernel.org?part=1

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-07  7:59       ` [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set() Tejun Heo
  2026-06-07  8:12         ` sashiko-bot
@ 2026-06-07  8:38         ` bot+bpf-ci
  2026-06-07 20:04           ` Tejun Heo
  1 sibling, 1 reply; 13+ messages in thread
From: bot+bpf-ci @ 2026-06-07  8:38 UTC (permalink / raw)
  To: tj, catalin.marinas, will, ast
  Cc: david, arighi, memxor, akpm, rppt, andrii, daniel, martin.lau,
	eddyz87, yonghong.song, emil, void, changwoo, linux-arm-kernel,
	linux-mm, bpf, linux-kernel, ast, andrii, daniel, martin.lau,
	eddyz87, yonghong.song, clm, ihor.solodrai

[-- Attachment #1: Type: text/plain, Size: 1949 bytes --]

> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 3ce0f2a6cab6..dc8525431273 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1838,7 +1838,11 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
>  {
>  	pteval_t old = 0;
>
> -	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
> +	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
> +		return false;
> +
> +	__set_pte_complete(new_pte);

An earlier posting of this drew a question on the mailing list about
__set_pte_complete() deferring the barriers in some contexts:

  https://lore.kernel.org/all/20260607081229.12BC21F00893@smtp.kernel.org/

The concern is that __set_pte_complete() delegates to
queue_pte_barriers(), which only sets TIF_LAZY_MMU_PENDING and defers the
dsb when is_lazy_mmu_mode_active() is true.

The scenario raised is a synchronous BPF arena fault (for example via a
kprobe, where in_interrupt() is false) interrupting a task that is inside a
lazy MMU block such as zap_pte_range(). There is_lazy_mmu_mode_active()
would be true, so the dsb is deferred, and the fault handler could return
before the memory system observes the new PTE. The faulting instruction
would then retry and fault again.

Can this path actually loop, or is the deferred barrier guaranteed to be
flushed before the faulting instruction is retried?

Note this deferral mechanism is the same one used by the existing
__set_pte() path that this change deliberately mirrors, and the comment
above came from an automated review rather than a human, so the scenario
has not been confirmed.

> +	return true;
>  }
>  #define ptep_try_set ptep_try_set

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27087148382

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-07  8:38         ` bot+bpf-ci
@ 2026-06-07 20:04           ` Tejun Heo
  2026-06-07 20:31             ` Catalin Marinas
  0 siblings, 1 reply; 13+ messages in thread
From: Tejun Heo @ 2026-06-07 20:04 UTC (permalink / raw)
  To: bot+bpf-ci, Catalin Marinas, Will Deacon, Alexei Starovoitov
  Cc: david, arighi, memxor, akpm, rppt, andrii, daniel, martin.lau,
	martin.lau, eddyz87, yonghong.song, emil, void, changwoo, clm,
	ihor.solodrai, linux-arm-kernel, linux-mm, bpf, linux-kernel

> Can this path actually loop, or is the deferred barrier guaranteed to be
> flushed before the faulting instruction is retried?

I don't know the arm64 paths well enough to say. What I can see is that
ptep_try_set() only runs as an apply_to_page_range() callback, and
apply_to_pte_range() brackets it with lazy_mmu_mode_enable()/disable(), with
the disable() flushing TIF_LAZY_MMU_PENDING before returning. The barriers
would land before the access is retried. It also looks like the same
queue_pte_barriers() path __set_pte() already uses. I'd defer to Catalin and
the arm64 folks on whether that actually closes the case.

Thanks.

--
tejun

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-07 20:04           ` Tejun Heo
@ 2026-06-07 20:31             ` Catalin Marinas
  2026-06-08  7:25               ` [PATCH v2 " Tejun Heo
  0 siblings, 1 reply; 13+ messages in thread
From: Catalin Marinas @ 2026-06-07 20:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: bot+bpf-ci, Will Deacon, Alexei Starovoitov, david, arighi,
	memxor, akpm, rppt, andrii, daniel, martin.lau, martin.lau,
	eddyz87, yonghong.song, emil, void, changwoo, clm, ihor.solodrai,
	linux-arm-kernel, linux-mm, bpf, linux-kernel

On Sun, Jun 07, 2026 at 10:04:19AM -1000, Tejun Heo wrote:
> > Can this path actually loop, or is the deferred barrier guaranteed to be
> > flushed before the faulting instruction is retried?
> 
> I don't know the arm64 paths well enough to say. What I can see is that
> ptep_try_set() only runs as an apply_to_page_range() callback, and
> apply_to_pte_range() brackets it with lazy_mmu_mode_enable()/disable(), with
> the disable() flushing TIF_LAZY_MMU_PENDING before returning. The barriers
> would land before the access is retried. It also looks like the same
> queue_pte_barriers() path __set_pte() already uses. I'd defer to Catalin and
> the arm64 folks on whether that actually closes the case.

I don't fully understand the BPF parts but I think the bots have a
point. If a BPF kprobe fires while we are in lazy mmu mode,
__set_pte_complete() will defer issuing the barriers.

I think it is better to just call emit_pte_barriers() directly. If
ptep_try_set() is always called with valid kernel ptes, we can skip the
if (pte_valid_not_user()) check as well (which was just an optimisation
anyway).

-- 
Catalin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 bpf-next] arm64: mm: Complete the PTE store in ptep_try_set()
  2026-06-07 20:31             ` Catalin Marinas
@ 2026-06-08  7:25               ` Tejun Heo
  0 siblings, 0 replies; 13+ messages in thread
From: Tejun Heo @ 2026-06-08  7:25 UTC (permalink / raw)
  To: Catalin Marinas, Will Deacon, Alexei Starovoitov
  Cc: David Hildenbrand, Andrea Righi, Kumar Kartikeya Dwivedi,
	Andrew Morton, Mike Rapoport, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Eduard Zingerman, Yonghong Song,
	Emil Tsalapatis, David Vernet, Changwoo Min, linux-arm-kernel,
	linux-mm, bpf, linux-kernel

ptep_try_set() installs a kernel PTE with try_cmpxchg() but, unlike
__set_pte(), skips the barriers that arm64 requires after writing a valid
kernel PTE. Without them a subsequent access can fault instead of seeing
the new mapping.

Issue them with emit_pte_barriers() rather than __set_pte_complete().
ptep_try_set() must finish the store before it returns, but
__set_pte_complete() would defer the barriers when the calling context is in
lazy MMU mode.

v2: Emit the barriers directly instead of __set_pte_complete(). (Catalin)

Fixes: 258df8fce42f ("mm: Add ptep_try_set() for lockless empty-slot installs")
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/all/aiRFcz78QTZdIHHB@arm.com/
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/arm64/include/asm/pgtable.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 3ce0f2a6cab6..3e579c26b383 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1838,7 +1838,16 @@ static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
 {
 	pteval_t old = 0;
 
-	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+	if (!try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte)))
+		return false;
+
+	/*
+	 * The store must be complete by the time this returns, but the caller
+	 * may be in lazy MMU mode, where __set_pte_complete() would defer the
+	 * barriers. Issue them directly.
+	 */
+	emit_pte_barriers();
+	return true;
 }
 #define ptep_try_set ptep_try_set
 
-- 
2.51.1

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2026-06-08  7:25 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-01 18:37 [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Tejun Heo
2026-06-01 18:58 ` sashiko-bot
2026-06-01 19:19   ` Tejun Heo
     [not found] ` <f45f02410d23c99d14f3577da9b27a327816b5a0904bdeefc8e229eac760d1cf@mail.kernel.org>
2026-06-02 22:09   ` Tejun Heo
2026-06-06 16:06     ` Catalin Marinas
2026-06-07  7:59       ` [PATCH bpf-next] arm64: mm: Complete the PTE store in ptep_try_set() Tejun Heo
2026-06-07  8:12         ` sashiko-bot
2026-06-07  8:38         ` bot+bpf-ci
2026-06-07 20:04           ` Tejun Heo
2026-06-07 20:31             ` Catalin Marinas
2026-06-08  7:25               ` [PATCH v2 " Tejun Heo
2026-06-02 22:29 ` [PATCH bpf-next] bpf: Replace scratch PTE atomically when allocating arena pages Kumar Kartikeya Dwivedi
2026-06-05 15:30 ` patchwork-bot+netdevbpf

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox