Re: [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Puranjay Mohan <puranjay@kernel.org>
To: bot+bpf-ci@kernel.org, bpf@vger.kernel.org
Cc: ast@kernel.org, andrii@kernel.org, daniel@iogearbox.net,
	martin.lau@kernel.org, eddyz87@gmail.com, memxor@gmail.com,
	kernel-team@meta.com, ast@kernel.org, andrii@kernel.org,
	daniel@iogearbox.net, martin.lau@kernel.org, eddyz87@gmail.com,
	yonghong.song@linux.dev, clm@meta.com, ihor.solodrai@linux.dev
Subject: Re: [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe
Date: Fri, 14 Nov 2025 15:28:54 +0000	[thread overview]
Message-ID: <mb61pqzu01vm1.fsf@kernel.org> (raw)
In-Reply-To: <b0a8bdc792e9424566da4b764770c86f4af787e6bd714162917163768f2af4b4@mail.kernel.org>

bot+bpf-ci@kernel.org writes:

>> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
>> index 7fa6e40ab..ca443c113 100644
>> --- a/kernel/bpf/arena.c
>> +++ b/kernel/bpf/arena.c
>
> [ ... ]
>
>> @@ -331,12 +355,19 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>>  	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
>>  	struct page *page;
>>  	long kbase, kaddr;
>> +	unsigned long flags;
>>  	int ret;
>>
>>  	kbase = bpf_arena_get_kern_vm_start(arena);
>>  	kaddr = kbase + (u32)(vmf->address);
>>
>> -	guard(mutex)(&arena->lock);
>> +	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
>> +		/*
>> +		 * This is an impossible case and would only trigger if res_spin_lock is buggy or
>> +		 * due to another kernel bug.
>> +		 */
>> +		return VM_FAULT_RETRY;
>> +
>>  	page = vmalloc_to_page((void *)kaddr);
>>  	if (page)
>>  		/* already have a page vmap-ed */
>> @@ -348,26 +379,30 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>>
>>  	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
>>  	if (ret)
>> -		return VM_FAULT_SIGSEGV;
>> +		goto out_unlock_sigsegv;
>>
>>  	struct apply_range_data data = { .pages = &page, .i = 0 };
>>  	/* Account into memcg of the process that created bpf_arena */
>>  	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>>  	if (ret) {
>>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>> -		return VM_FAULT_SIGSEGV;
>> +		goto out_unlock_sigsegv;
>>  	}
>>
>>  	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
>>  	if (ret) {
>>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>> -		__free_page(page);
>> -		return VM_FAULT_SIGSEGV;
>> +		free_pages_nolock(page, 0);
>> +		goto out_unlock_sigsegv;
>>  	}
>>  out:
>>  	page_ref_add(page, 1);
>> +	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>>  	vmf->page = page;
>>  	return 0;
>
> In arena_vm_fault(), there's a path between acquiring the spinlock and
> the new goto labels that doesn't release the lock:
>
>>  	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>>  		return VM_FAULT_SIGSEGV;
>
> This return statement is reached after raw_res_spin_lock_irqsave()
> succeeds but before any of the unlock paths. If BPF_F_SEGV_ON_FAULT is
> set, subsequent page faults on this arena will deadlock. All other
> return paths in the function properly jump to out_unlock_sigsegv to
> release the lock.

Good catch, this needs to be goto out_unlock_sigsegv;

>> +out_unlock_sigsegv:
>> +	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>> +	return VM_FAULT_SIGSEGV;
>>  }
>
> [ ... ]
>
>> @@ -594,42 +633,65 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>>  {
>>  	struct vma_list *vml;
>>
>> +	guard(mutex)(&arena->lock);
>> +	/* iterate link list under lock */
>>  	list_for_each_entry(vml, &arena->vma_list, head)
>>  		zap_page_range_single(vml->vma, uaddr,
>>  				      PAGE_SIZE * page_cnt, NULL);
>>  }
>>
>> -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>> +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
>>  {
>>  	u64 full_uaddr, uaddr_end;
>> -	long kaddr, pgoff, i;
>> +	long kaddr, pgoff;
>>  	struct page *page;
>> +	struct llist_head free_pages;
>> +	struct llist_node *pos, *t;
>> +	struct arena_free_span *s;
>> +	unsigned long flags;
>> +	int ret = 0;
>>
>>  	/* only aligned lower 32-bit are relevant */
>>  	uaddr = (u32)uaddr;
>>  	uaddr &= PAGE_MASK;
>> +	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
>>  	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
>>  	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
>>  	if (full_uaddr >= uaddr_end)
>>  		return;
>>
>>  	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
>> +	pgoff = compute_pgoff(arena, uaddr);
>>
>> -	guard(mutex)(&arena->lock);
>> +	if (!sleepable)
>> +		goto defer;
>> +
>> +	ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
>> +	/*
>> +	 * Can't proceed without holding the spinlock so defer the free
>> +	 */
>> +	if (ret)
>> +		goto defer;
>
> [ ... ]
>
>> +	return;
>> +
>> +defer:
>> +	s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
>> +	if (!s)
>> +		return;
>
> In arena_free_pages(), when the deferred free path is taken but
> kmalloc_nolock() fails, the function returns silently without freeing
> the pages. These pages remain mapped in the kernel page tables and
> marked as allocated in the range tree, but become unusable.
>
> This can happen when:
> - The function is called from a non-sleepable context (sleepable=false)
> - Lock acquisition fails and falls through to defer
> - kmalloc_nolock() returns NULL (PREEMPT_RT+IRQ, lock contention, or
>   unsupported arch)
>
> The deferred free path is called from arena_alloc_pages() error
> handling and bpf_arena_free_pages_non_sleepable(). In both cases, the
> pages that should be freed will leak.
>
> Should there be a fallback mechanism or at least a warning when
> deferral fails?

Yes, this is expected because if kmalloc_nolock() fails in non-sleepable
context, we don't have another way to get memory for arena_free_span, we
are accepting this trade-off, failing a 24 byte allocation here is
highly unlikely, but we can have a WARN_ONCE here. We had an offline
discussion about having debug counters from arena, maybe we can add a
counter here.

And for failure path of arena_alloc_pages, we could make
arena_alloc_pages get a sleepable parameter and call arena_free_pages()
with this parameter so we don't defer unnecessarily when
arena_alloc_pages() is called in sleepable context.

>> +
>> +	s->page_cnt = page_cnt;
>> +	s->uaddr = uaddr;
>> +	llist_add(&s->node, &arena->free_spans);
>> +	irq_work_queue(&arena->free_irq);
>>  }
>
>
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
>
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19363121319

next prev parent reply	other threads:[~2025-11-14 15:28 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-14 11:16 [PATCH bpf-next v2 0/4] Remove KF_SLEEPABLE from arena kfuncs Puranjay Mohan
2025-11-14 11:16 ` [PATCH bpf-next v2 1/4] bpf: arena: populate vm_area without allocating memory Puranjay Mohan
2025-11-14 11:47   ` bot+bpf-ci
2025-11-14 14:57     ` Puranjay Mohan
2025-11-14 21:21       ` Alexei Starovoitov
2025-11-15  0:52         ` Puranjay Mohan
2025-11-15  1:26           ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 2/4] bpf: arena: use kmalloc_nolock() in place of kvcalloc() Puranjay Mohan
2025-11-14 11:39   ` bot+bpf-ci
2025-11-14 15:13     ` Puranjay Mohan
2025-11-14 21:25   ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe Puranjay Mohan
2025-11-14 11:47   ` bot+bpf-ci
2025-11-14 15:28     ` Puranjay Mohan [this message]
2025-11-14 21:27   ` Alexei Starovoitov
2025-11-15  0:56     ` Puranjay Mohan
2025-11-15  1:28       ` Alexei Starovoitov
2025-11-15  8:18   ` kernel test robot
2025-11-16  1:15   ` kernel test robot
2025-11-14 11:16 ` [PATCH bpf-next v2 4/4] selftests: bpf: test non-sleepable arena allocations Puranjay Mohan
2025-11-14 22:18   ` Alexei Starovoitov
2025-11-15  0:58     ` Puranjay Mohan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=mb61pqzu01vm1.fsf@kernel.org \
    --to=puranjay@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bot+bpf-ci@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=clm@meta.com \
    --cc=daniel@iogearbox.net \
    --cc=eddyz87@gmail.com \
    --cc=ihor.solodrai@linux.dev \
    --cc=kernel-team@meta.com \
    --cc=martin.lau@kernel.org \
    --cc=memxor@gmail.com \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.