From: Puranjay Mohan <puranjay@kernel.org>
To: bpf@vger.kernel.org
Cc: Puranjay Mohan <puranjay@kernel.org>,
Puranjay Mohan <puranjay12@gmail.com>,
Alexei Starovoitov <ast@kernel.org>,
Andrii Nakryiko <andrii@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
Martin KaFai Lau <martin.lau@kernel.org>,
Eduard Zingerman <eddyz87@gmail.com>,
Kumar Kartikeya Dwivedi <memxor@gmail.com>,
kernel-team@meta.com
Subject: [PATCH bpf-next v2 1/4] bpf: arena: populate vm_area without allocating memory
Date: Fri, 14 Nov 2025 11:16:56 +0000 [thread overview]
Message-ID: <20251114111700.43292-2-puranjay@kernel.org> (raw)
In-Reply-To: <20251114111700.43292-1-puranjay@kernel.org>
vm_area_map_pages() may allocate memory while inserting pages into bpf
arena's vm_area. In order to make bpf_arena_alloc_pages() kfunc
non-sleepable change bpf arena to populate pages without
allocating memory:
- at arena creation time populate all page table levels except
the last level
- when new pages need to be inserted call apply_to_page_range() again
with apply_range_set_cb() which will only set_pte_at() those pages and
will not allocate memory.
- when freeing pages call apply_to_existing_page_range with
apply_range_clear_cb() to clear the pte for the page to be removed. This
doesn't free intermediate page table levels.
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
kernel/bpf/arena.c | 76 ++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 70 insertions(+), 6 deletions(-)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1074ac4459f2..48b8ffba3c88 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -7,6 +7,7 @@
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
+#include <asm/tlbflush.h>
#include "range_tree.h"
/*
@@ -92,6 +93,62 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr)
return (u32)(uaddr - (u32)arena->user_vm_start) >> PAGE_SHIFT;
}
+struct apply_range_data {
+ struct page **pages;
+ int i;
+};
+
+static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct apply_range_data *d = data;
+ struct page *page;
+
+ if (!data)
+ return 0;
+ /* sanity check */
+ if (unlikely(!pte_none(ptep_get(pte))))
+ return -EBUSY;
+
+ page = d->pages[d->i++];
+ /* paranoia, similar to vmap_pages_pte_range() */
+ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL));
+ return 0;
+}
+
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ pte_t old_pte;
+ struct page *page;
+
+ /* sanity check */
+ old_pte = ptep_get(pte);
+ if (pte_none(old_pte) || !pte_present(old_pte))
+ return 0; /* nothing to do */
+
+ /* get page and free it */
+ page = pte_page(old_pte);
+ if (WARN_ON_ONCE(!page))
+ return -EINVAL;
+
+ pte_clear(&init_mm, addr, pte);
+
+ /* ensure no stale TLB entries */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+ __free_page(page);
+
+ return 0;
+}
+
+static int populate_pgtable_except_pte(struct bpf_arena *arena)
+{
+ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
+ KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+}
+
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
{
struct vm_struct *kern_vm;
@@ -144,6 +201,11 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
goto err;
}
mutex_init(&arena->lock);
+ err = populate_pgtable_except_pte(arena);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
return &arena->map;
err:
@@ -286,6 +348,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
if (ret)
return VM_FAULT_SIGSEGV;
+ struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
@@ -293,7 +356,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_SIGSEGV;
}
- ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
+ ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
__free_page(page);
@@ -428,7 +491,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
/* user_vm_end/start are fixed before bpf prog runs */
long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
- struct page **pages;
+ struct page **pages = NULL;
long pgoff = 0;
u32 uaddr32;
int ret, i;
@@ -465,6 +528,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out_free_pages;
+ struct apply_range_data data = { .pages = pages, .i = 0 };
ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;
@@ -477,8 +541,8 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
* kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
* lower 32-bit and it's ok.
*/
- ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32,
- kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages);
+ ret = apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
+ page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
if (ret) {
for (i = 0; i < page_cnt; i++)
__free_page(pages[i]);
@@ -545,8 +609,8 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
* page_cnt is big it's faster to do the batched zap.
*/
zap_pages(arena, full_uaddr, 1);
- vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE);
- __free_page(page);
+ apply_to_existing_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_clear_cb,
+ NULL);
}
}
--
2.47.1
next prev parent reply other threads:[~2025-11-14 11:17 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-14 11:16 [PATCH bpf-next v2 0/4] Remove KF_SLEEPABLE from arena kfuncs Puranjay Mohan
2025-11-14 11:16 ` Puranjay Mohan [this message]
2025-11-14 11:47 ` [PATCH bpf-next v2 1/4] bpf: arena: populate vm_area without allocating memory bot+bpf-ci
2025-11-14 14:57 ` Puranjay Mohan
2025-11-14 21:21 ` Alexei Starovoitov
2025-11-15 0:52 ` Puranjay Mohan
2025-11-15 1:26 ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 2/4] bpf: arena: use kmalloc_nolock() in place of kvcalloc() Puranjay Mohan
2025-11-14 11:39 ` bot+bpf-ci
2025-11-14 15:13 ` Puranjay Mohan
2025-11-14 21:25 ` Alexei Starovoitov
2025-11-14 11:16 ` [PATCH bpf-next v2 3/4] bpf: arena: make arena kfuncs any context safe Puranjay Mohan
2025-11-14 11:47 ` bot+bpf-ci
2025-11-14 15:28 ` Puranjay Mohan
2025-11-14 21:27 ` Alexei Starovoitov
2025-11-15 0:56 ` Puranjay Mohan
2025-11-15 1:28 ` Alexei Starovoitov
2025-11-15 8:18 ` kernel test robot
2025-11-16 1:15 ` kernel test robot
2025-11-14 11:16 ` [PATCH bpf-next v2 4/4] selftests: bpf: test non-sleepable arena allocations Puranjay Mohan
2025-11-14 22:18 ` Alexei Starovoitov
2025-11-15 0:58 ` Puranjay Mohan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251114111700.43292-2-puranjay@kernel.org \
--to=puranjay@kernel.org \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=eddyz87@gmail.com \
--cc=kernel-team@meta.com \
--cc=martin.lau@kernel.org \
--cc=memxor@gmail.com \
--cc=puranjay12@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox