BPF List
 help / color / mirror / Atom feed
From: Puranjay Mohan <puranjay@kernel.org>
To: bot+bpf-ci@kernel.org, bpf@vger.kernel.org
Cc: ast@kernel.org, andrii@kernel.org, daniel@iogearbox.net,
	martin.lau@kernel.org, eddyz87@gmail.com, memxor@gmail.com,
	kernel-team@meta.com, ast@kernel.org, andrii@kernel.org,
	daniel@iogearbox.net, martin.lau@kernel.org, eddyz87@gmail.com,
	yonghong.song@linux.dev, clm@meta.com, ihor.solodrai@linux.dev
Subject: Re: [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe
Date: Fri, 14 Nov 2025 15:28:54 +0000	[thread overview]
Message-ID: <mb61pqzu01vm1.fsf@kernel.org> (raw)
In-Reply-To: <b0a8bdc792e9424566da4b764770c86f4af787e6bd714162917163768f2af4b4@mail.kernel.org>

bot+bpf-ci@kernel.org writes:

>> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
>> index 7fa6e40ab..ca443c113 100644
>> --- a/kernel/bpf/arena.c
>> +++ b/kernel/bpf/arena.c
>
> [ ... ]
>
>> @@ -331,12 +355,19 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>>  	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
>>  	struct page *page;
>>  	long kbase, kaddr;
>> +	unsigned long flags;
>>  	int ret;
>>
>>  	kbase = bpf_arena_get_kern_vm_start(arena);
>>  	kaddr = kbase + (u32)(vmf->address);
>>
>> -	guard(mutex)(&arena->lock);
>> +	if (raw_res_spin_lock_irqsave(&arena->spinlock, flags))
>> +		/*
>> +		 * This is an impossible case and would only trigger if res_spin_lock is buggy or
>> +		 * due to another kernel bug.
>> +		 */
>> +		return VM_FAULT_RETRY;
>> +
>>  	page = vmalloc_to_page((void *)kaddr);
>>  	if (page)
>>  		/* already have a page vmap-ed */
>> @@ -348,26 +379,30 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>>
>>  	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
>>  	if (ret)
>> -		return VM_FAULT_SIGSEGV;
>> +		goto out_unlock_sigsegv;
>>
>>  	struct apply_range_data data = { .pages = &page, .i = 0 };
>>  	/* Account into memcg of the process that created bpf_arena */
>>  	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>>  	if (ret) {
>>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>> -		return VM_FAULT_SIGSEGV;
>> +		goto out_unlock_sigsegv;
>>  	}
>>
>>  	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
>>  	if (ret) {
>>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>> -		__free_page(page);
>> -		return VM_FAULT_SIGSEGV;
>> +		free_pages_nolock(page, 0);
>> +		goto out_unlock_sigsegv;
>>  	}
>>  out:
>>  	page_ref_add(page, 1);
>> +	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>>  	vmf->page = page;
>>  	return 0;
>
> In arena_vm_fault(), there's a path between acquiring the spinlock and
> the new goto labels that doesn't release the lock:
>
>>  	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>>  		return VM_FAULT_SIGSEGV;
>
> This return statement is reached after raw_res_spin_lock_irqsave()
> succeeds but before any of the unlock paths. If BPF_F_SEGV_ON_FAULT is
> set, subsequent page faults on this arena will deadlock. All other
> return paths in the function properly jump to out_unlock_sigsegv to
> release the lock.

Good catch — this needs to be goto out_unlock_sigsegv;

>> +out_unlock_sigsegv:
>> +	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>> +	return VM_FAULT_SIGSEGV;
>>  }
>
> [ ... ]
>
>> @@ -594,42 +633,65 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>>  {
>>  	struct vma_list *vml;
>>
>> +	guard(mutex)(&arena->lock);
>> +	/* iterate link list under lock */
>>  	list_for_each_entry(vml, &arena->vma_list, head)
>>  		zap_page_range_single(vml->vma, uaddr,
>>  				      PAGE_SIZE * page_cnt, NULL);
>>  }
>>
>> -static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
>> +static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable)
>>  {
>>  	u64 full_uaddr, uaddr_end;
>> -	long kaddr, pgoff, i;
>> +	long kaddr, pgoff;
>>  	struct page *page;
>> +	struct llist_head free_pages;
>> +	struct llist_node *pos, *t;
>> +	struct arena_free_span *s;
>> +	unsigned long flags;
>> +	int ret = 0;
>>
>>  	/* only aligned lower 32-bit are relevant */
>>  	uaddr = (u32)uaddr;
>>  	uaddr &= PAGE_MASK;
>> +	kaddr = bpf_arena_get_kern_vm_start(arena) + uaddr;
>>  	full_uaddr = clear_lo32(arena->user_vm_start) + uaddr;
>>  	uaddr_end = min(arena->user_vm_end, full_uaddr + (page_cnt << PAGE_SHIFT));
>>  	if (full_uaddr >= uaddr_end)
>>  		return;
>>
>>  	page_cnt = (uaddr_end - full_uaddr) >> PAGE_SHIFT;
>> +	pgoff = compute_pgoff(arena, uaddr);
>>
>> -	guard(mutex)(&arena->lock);
>> +	if (!sleepable)
>> +		goto defer;
>> +
>> +	ret = raw_res_spin_lock_irqsave(&arena->spinlock, flags);
>> +	/*
>> +	 * Can't proceed without holding the spinlock so defer the free
>> +	 */
>> +	if (ret)
>> +		goto defer;
>
> [ ... ]
>
>> +	return;
>> +
>> +defer:
>> +	s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
>> +	if (!s)
>> +		return;
>
> In arena_free_pages(), when the deferred free path is taken but
> kmalloc_nolock() fails, the function returns silently without freeing
> the pages. These pages remain mapped in the kernel page tables and
> marked as allocated in the range tree, but become unusable.
>
> This can happen when:
> - The function is called from a non-sleepable context (sleepable=false)
> - Lock acquisition fails and falls through to defer
> - kmalloc_nolock() returns NULL (PREEMPT_RT+IRQ, lock contention, or
>   unsupported arch)
>
> The deferred free path is called from arena_alloc_pages() error
> handling and bpf_arena_free_pages_non_sleepable(). In both cases, the
> pages that should be freed will leak.
>
> Should there be a fallback mechanism or at least a warning when
> deferral fails?

Yes, this is expected: if kmalloc_nolock() fails in a non-sleepable
context, we don't have another way to get memory for arena_free_span, so
we are accepting this trade-off. Failing a 24-byte allocation here is
highly unlikely, but we can add a WARN_ONCE here. We had an offline
discussion about adding debug counters to arena; maybe we can add a
counter here as well.

And for the failure path of arena_alloc_pages(), we could make
arena_alloc_pages() take a sleepable parameter and pass it through to
arena_free_pages(), so we don't defer unnecessarily when
arena_alloc_pages() is called in a sleepable context.

>> +
>> +	s->page_cnt = page_cnt;
>> +	s->uaddr = uaddr;
>> +	llist_add(&s->node, &arena->free_spans);
>> +	irq_work_queue(&arena->free_irq);
>>  }
>
>
> ---
> AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
> See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
>
> CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19363121319

  reply	other threads:[~2025-11-14 15:28 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-14 11:16 [PATCH bpf-next v2 0/4] Remove KF_SLEEPABLE from arena kfuncs Puranjay Mohan
2025-11-14 11:16 ` [PATCH bpf-next v2 1/4] bpf: arena: populate vm_area without allocating memory Puranjay Mohan
2025-11-14 11:47   ` bot+bpf-ci
2025-11-14 14:57     ` Puranjay Mohan
2025-11-14 21:21       ` Alexei Starovoitov
2025-11-15  0:52         ` Puranjay Mohan
2025-11-15  1:26           ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 2/4] bpf: arena: use kmalloc_nolock() in place of kvcalloc() Puranjay Mohan
2025-11-14 11:39   ` bot+bpf-ci
2025-11-14 15:13     ` Puranjay Mohan
2025-11-14 21:25   ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe Puranjay Mohan
2025-11-14 11:47   ` bot+bpf-ci
2025-11-14 15:28     ` Puranjay Mohan [this message]
2025-11-14 21:27   ` Alexei Starovoitov
2025-11-15  0:56     ` Puranjay Mohan
2025-11-15  1:28       ` Alexei Starovoitov
2025-11-15  8:18   ` kernel test robot
2025-11-16  1:15   ` kernel test robot
2025-11-14 11:16 ` [PATCH bpf-next v2 4/4] selftests: bpf: test non-sleepable arena allocations Puranjay Mohan
2025-11-14 22:18   ` Alexei Starovoitov
2025-11-15  0:58     ` Puranjay Mohan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=mb61pqzu01vm1.fsf@kernel.org \
    --to=puranjay@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bot+bpf-ci@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=clm@meta.com \
    --cc=daniel@iogearbox.net \
    --cc=eddyz87@gmail.com \
    --cc=ihor.solodrai@linux.dev \
    --cc=kernel-team@meta.com \
    --cc=martin.lau@kernel.org \
    --cc=memxor@gmail.com \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox