From: Puranjay Mohan <puranjay@kernel.org>
To: bot+bpf-ci@kernel.org, bpf@vger.kernel.org
Cc: ast@kernel.org, andrii@kernel.org, daniel@iogearbox.net,
martin.lau@kernel.org, eddyz87@gmail.com, memxor@gmail.com,
kernel-team@meta.com, ast@kernel.org, andrii@kernel.org,
daniel@iogearbox.net, martin.lau@kernel.org, eddyz87@gmail.com,
yonghong.song@linux.dev, clm@meta.com, ihor.solodrai@linux.dev
Subject: Re: [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe
Date: Fri, 14 Nov 2025 15:28:54 +0000 [thread overview]
Message-ID: <mb61pqzu01vm1.fsf@kernel.org> (raw)
In-Reply-To: <b0a8bdc792e9424566da4b764770c86f4af787e6bd714162917163768f2af4b4@mail.kernel.org>
bot+bpf-ci@kernel.org writes:
>> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
>> index 7fa6e40ab..ca443c113 100644
>> --- a/kernel/bpf/arena.c
>> +++ b/kernel/bpf/arena.c
>
> [ ... ]
>
>> @@ -331,12 +355,19 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>> struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
>> struct page *page;
>> long kbase, kaddr;
>> + unsigned long flags;
>> int ret;
>>
>> kbase = bpf_arena_get_kern_vm_start(arena);
>> kaddr = kbase + (u32)(vmf->address);
>>
>> - guard(mutex)(&arena->lock);
>> + if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
>> + /*
>> + * This is an impossible case and would only trigger if res_spin_lock is buggy or
>> + * due to another kernel bug.
>> + */
>> + return VM_FAULT_RETRY;
>> +
>> page = vmalloc_to_page((void *)kaddr);
>> if (page)
>> /* already have a page vmap-ed */
>> @@ -348,26 +379,30 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>>
>> ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
>> if (ret)
>> - return VM_FAULT_SIGSEGV;
>> + goto out_unlock_sigsegv;
>>
>> struct apply_range_data data = { .pages = &page, .i = 0 };
>> /* Account into memcg of the process that created bpf_arena */
>> ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>> if (ret) {
>> range_tree_set(&arena->rt, vmf->pgoff, 1);
>> - return VM_FAULT_SIGSEGV;
>> + goto out_unlock_sigsegv;
>> }
>>
>> ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
>> if (ret) {
>> range_tree_set(&arena->rt, vmf->pgoff, 1);
>> - __free_page(page);
>> - return VM_FAULT_SIGSEGV;
>> + free_pages_nolock(page, 0);
>> + goto out_unlock_sigsegv;
>> }
>> out:
>> page_ref_add(page, 1);
>> + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>> vmf->page = page;
>> return 0;
>
> In arena_vm_fault(), there's a path between acquiring the spinlock and
> the new goto labels that doesn't release the lock:
>
>> if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>> return VM_FAULT_SIGSEGV;
>
> This return statement is reached after raw_res_spin_lock_irqsave()
> succeeds but before any of the unlock paths. If BPF_F_SEGV_ON_FAULT is
> set, subsequent page faults on this arena will deadlock. All other
> return paths in the function properly jump to out_unlock_sigsegv to
> release the lock.
Good catch — this return also needs to be `goto out_unlock_sigsegv;` so the spinlock is released on that path as well.
>> +out_unlock_sigsegv:
>> + raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>> + return VM_FAULT_SIGSEGV;
>> }
>
> [ ... ]
>
>> @@ -594,42 +633,65 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>> {
>> struct vma_list *vml;
>>
>> + guard(mutex)(&arena->lock);
>> + /* iterate link list under lock */
>> list_for_each_entry(vml, &arena->vma_list, head)
>> zap_page_range_single(vml->vma, uaddr,
>> PAGE_SIZE * page_cnt, NULL);
>> }
>>
>> -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>> +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
>> {
>> u64 full_uaddr, uaddr_end;
>> - long kaddr, pgoff, i;
>> + long kaddr, pgoff;
>> struct page *page;
>> + struct llist_head free_pages;
>> + struct llist_node *pos, *t;
>> + struct arena_free_span *s;
>> + unsigned long flags;
>> + int ret = 0;
>>
>> /* only aligned lower 32-bit are relevant */
>> uaddr = (u32)uaddr;
>> uaddr &= PAGE_MASK;
>> + kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
>> full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
>> uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
>> if (full_uaddr >= uaddr_end)
>> return;
>>
>> page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
>> + pgoff = compute_pgoff(arena, uaddr);
>>
>> - guard(mutex)(&arena->lock);
>> + if (!sleepable)
>> + goto defer;
>> +
>> + ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
>> + /*
>> + * Can't proceed without holding the spinlock so defer the free
>> + */
>> + if (ret)
>> + goto defer;
>
> [ ... ]
>
>> + return;
>> +
>> +defer:
>> + s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
>> + if (!s)
>> + return;
>
> In arena_free_pages(), when the deferred free path is taken but
> kmalloc_nolock() fails, the function returns silently without freeing
> the pages. These pages remain mapped in the kernel page tables and
> marked as allocated in the range tree, but become unusable.
>
> This can happen when:
> - The function is called from a non-sleepable context (sleepable=false)
> - Lock acquisition fails and falls through to defer
> - kmalloc_nolock() returns NULL (PREEMPT_RT+IRQ, lock contention, or
> unsupported arch)
>
> The deferred free path is called from arena_alloc_pages() error
> handling and bpf_arena_free_pages_non_sleepable(). In both cases, the
> pages that should be freed will leak.
>
> Should there be a fallback mechanism or at least a warning when
> deferral fails?
Yes, this is expected: if kmalloc_nolock() fails in a non-sleepable
context, we have no other way to get memory for the arena_free_span, so
we are accepting this trade-off. Failing a 24-byte allocation here is
highly unlikely, but we could add a WARN_ONCE for it. We also had an
offline discussion about adding debug counters to the arena; perhaps we
can add a counter here as well.
As for the failure path of arena_alloc_pages(), we could give
arena_alloc_pages() a sleepable parameter and pass it through to
arena_free_pages(), so that we don't defer unnecessarily when
arena_alloc_pages() is called from a sleepable context.
>> +
>> + s->page_cnt = page_cnt;
>> + s->uaddr = uaddr;
>> + llist_add(&s->node, &arena->free_spans);
>> + irq_work_queue(&arena->free_irq);
>> }
>
>
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
>
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19363121319
next prev parent reply other threads:[~2025-11-14 15:28 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-14 11:16 [PATCH bpf-next v2 0/4] Remove KF_SLEEPABLE from arena kfuncs Puranjay Mohan
2025-11-14 11:16 ` [PATCH bpf-next v2 1/4] bpf: arena: populate vm_area without allocating memory Puranjay Mohan
2025-11-14 11:47 ` bot+bpf-ci
2025-11-14 14:57 ` Puranjay Mohan
2025-11-14 21:21 ` Alexei Starovoitov
2025-11-15 0:52 ` Puranjay Mohan
2025-11-15 1:26 ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 2/4] bpf: arena: use kmalloc_nolock() in place of kvcalloc() Puranjay Mohan
2025-11-14 11:39 ` bot+bpf-ci
2025-11-14 15:13 ` Puranjay Mohan
2025-11-14 21:25 ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe Puranjay Mohan
2025-11-14 11:47 ` bot+bpf-ci
2025-11-14 15:28 ` Puranjay Mohan [this message]
2025-11-14 21:27 ` Alexei Starovoitov
2025-11-15 0:56 ` Puranjay Mohan
2025-11-15 1:28 ` Alexei Starovoitov
2025-11-15 8:18 ` kernel test robot
2025-11-16 1:15 ` kernel test robot
2025-11-14 11:16 ` [PATCH bpf-next v2 4/4] selftests: bpf: test non-sleepable arena allocations Puranjay Mohan
2025-11-14 22:18 ` Alexei Starovoitov
2025-11-15 0:58 ` Puranjay Mohan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=mb61pqzu01vm1.fsf@kernel.org \
--to=puranjay@kernel.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bot+bpf-ci@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=clm@meta.com \
--cc=daniel@iogearbox.net \
--cc=eddyz87@gmail.com \
--cc=ihor.solodrai@linux.dev \
--cc=kernel-team@meta.com \
--cc=martin.lau@kernel.org \
--cc=memxor@gmail.com \
--cc=yonghong.song@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox