BPF List
* [PATCH bpf-next v2 0/2] memcg accounting for BPF arena
@ 2025-12-31 14:14 Puranjay Mohan
  2025-12-31 14:14 ` [PATCH bpf-next v2 1/2] bpf: syscall: Introduce memcg enter/exit helpers Puranjay Mohan
  2025-12-31 14:14 ` [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting Puranjay Mohan
  0 siblings, 2 replies; 5+ messages in thread
From: Puranjay Mohan @ 2025-12-31 14:14 UTC (permalink / raw)
  To: bpf
  Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team

v1: https://lore.kernel.org/all/20251230153006.1347742-1-puranjay@kernel.org/
Changes in v1->v2:
- Return both pointers through arguments from bpf_map_memcg_enter and
  make it return void. (Alexei)
- Add memcg accounting in arena_free_worker (AI)

This set adds memcg accounting to the arena kfuncs and to the other
allocation sites in arena.c.

Puranjay Mohan (2):
  bpf: syscall: Introduce memcg enter/exit helpers
  bpf: arena: Reintroduce memcg accounting

 include/linux/bpf.h     | 15 +++++++++++++
 kernel/bpf/arena.c      | 44 +++++++++++++++++++++++++++++++-----
 kernel/bpf/range_tree.c |  5 +++--
 kernel/bpf/syscall.c    | 50 +++++++++++++++++++++--------------------
 4 files changed, 83 insertions(+), 31 deletions(-)


base-commit: ccaa6d2c9635a8db06a494d67ef123b56b967a78
-- 
2.47.3



* [PATCH bpf-next v2 1/2] bpf: syscall: Introduce memcg enter/exit helpers
  2025-12-31 14:14 [PATCH bpf-next v2 0/2] memcg accounting for BPF arena Puranjay Mohan
@ 2025-12-31 14:14 ` Puranjay Mohan
  2025-12-31 14:14 ` [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting Puranjay Mohan
  1 sibling, 0 replies; 5+ messages in thread
From: Puranjay Mohan @ 2025-12-31 14:14 UTC (permalink / raw)
  To: bpf
  Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team

Introduce bpf_map_memcg_enter() and bpf_map_memcg_exit() helpers to
reduce code duplication in memcg context management.

bpf_map_memcg_enter() gets the memcg from the map, sets it as active,
and passes both the previous and the newly active memcg back through
output arguments.

bpf_map_memcg_exit() restores the previous active memcg and releases the
reference obtained during enter.
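
For reference, the resulting pattern in the converted helpers looks like
this (a minimal sketch; bpf_map_do_alloc() is a hypothetical stand-in
for any of the syscall.c allocators, not part of the patch):

	static void *bpf_map_do_alloc(const struct bpf_map *map, size_t size)
	{
		struct mem_cgroup *memcg, *old_memcg;
		void *ptr;

		/* Charge the allocation to the map's memcg, not current's. */
		bpf_map_memcg_enter(map, &old_memcg, &memcg);
		ptr = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
		/* Restore the previous active memcg and drop the reference. */
		bpf_map_memcg_exit(old_memcg, memcg);

		return ptr;
	}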

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/bpf.h  | 15 +++++++++++++
 kernel/bpf/syscall.c | 50 +++++++++++++++++++++++---------------------
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4e7d72dfbcd4..24a32b1043d1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2608,6 +2608,10 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG
+void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+			 struct mem_cgroup **new_memcg);
+void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+			struct mem_cgroup *memcg);
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node);
 void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags,
@@ -2632,6 +2636,17 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 		kvcalloc(_n, _size, _flags)
 #define bpf_map_alloc_percpu(_map, _size, _align, _flags)	\
 		__alloc_percpu_gfp(_size, _align, _flags)
+static inline void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+				       struct mem_cgroup **new_memcg)
+{
+	*new_memcg = NULL;
+	*old_memcg = NULL;
+}
+
+static inline void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+				      struct mem_cgroup *memcg)
+{
+}
 #endif
 
 static inline int
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a4d38272d8bc..63849a06518b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -505,17 +505,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
 	return root_mem_cgroup;
 }
 
+void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+			 struct mem_cgroup **new_memcg)
+{
+	*new_memcg = bpf_map_get_memcg(map);
+	*old_memcg = set_active_memcg(*new_memcg);
+}
+
+void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+			struct mem_cgroup *new_memcg)
+{
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(new_memcg);
+}
+
 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
 			   int node)
 {
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 	ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 
 	return ptr;
 }
@@ -526,11 +538,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 	ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 
 	return ptr;
 }
@@ -540,11 +550,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 	ptr = kzalloc(size, flags | __GFP_ACCOUNT);
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 
 	return ptr;
 }
@@ -555,11 +563,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
 	struct mem_cgroup *memcg, *old_memcg;
 	void *ptr;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 
 	return ptr;
 }
@@ -570,11 +576,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 	struct mem_cgroup *memcg, *old_memcg;
 	void __percpu *ptr;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 	ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 
 	return ptr;
 }
@@ -615,8 +619,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 #ifdef CONFIG_MEMCG
 	struct mem_cgroup *memcg, *old_memcg;
 
-	memcg = bpf_map_get_memcg(map);
-	old_memcg = set_active_memcg(memcg);
+	bpf_map_memcg_enter(map, &old_memcg, &memcg);
 #endif
 	for (i = 0; i < nr_pages; i++) {
 		pg = __bpf_alloc_page(nid);
@@ -632,8 +635,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 	}
 
 #ifdef CONFIG_MEMCG
-	set_active_memcg(old_memcg);
-	mem_cgroup_put(memcg);
+	bpf_map_memcg_exit(old_memcg, memcg);
 #endif
 	return ret;
 }
-- 
2.47.3



* [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting
  2025-12-31 14:14 [PATCH bpf-next v2 0/2] memcg accounting for BPF arena Puranjay Mohan
  2025-12-31 14:14 ` [PATCH bpf-next v2 1/2] bpf: syscall: Introduce memcg enter/exit helpers Puranjay Mohan
@ 2025-12-31 14:14 ` Puranjay Mohan
  2025-12-31 17:25   ` Alexei Starovoitov
  1 sibling, 1 reply; 5+ messages in thread
From: Puranjay Mohan @ 2025-12-31 14:14 UTC (permalink / raw)
  To: bpf
  Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team

When arena allocations were converted from bpf_map_alloc_pages() to
kmalloc_nolock() to support non-sleepable contexts, memcg accounting was
inadvertently lost. This commit restores proper memory accounting for
all arena-related allocations.

All arena-related allocations are now accounted to the memcg of the
process that created the bpf_arena.
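
Both halves are needed for this: bpf_map_memcg_enter() makes the arena
creator's memcg the active charge target, and __GFP_ACCOUNT opts each
allocation into accounting. A minimal sketch of the combination, using
only the helpers from patch 1 (arena_charged_alloc() is hypothetical):

	static void *arena_charged_alloc(struct bpf_arena *arena, size_t size)
	{
		struct mem_cgroup *new_memcg, *old_memcg;
		void *ptr;

		bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
		/* Charged to the active (arena creator's) memcg. */
		ptr = kmalloc_nolock(size, __GFP_ACCOUNT, NUMA_NO_NODE);
		bpf_map_memcg_exit(old_memcg, new_memcg);

		return ptr;
	}

This is also why the range_tree.c allocations only need __GFP_ACCOUNT:
they run while an arena kfunc already holds the memcg active.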

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/bpf/arena.c      | 44 ++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/range_tree.c |  5 +++--
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 456ac989269d..45b55961683f 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 {
 	struct bpf_map *map = vmf->vma->vm_file->private_data;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct page *page;
 	long kbase, kaddr;
 	unsigned long flags;
@@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		/* already have a page vmap-ed */
 		goto out;
 
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
+
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
 		goto out_unlock_sigsegv;
@@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		goto out_unlock_sigsegv;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
 out:
 	page_ref_add(page, 1);
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
 out_unlock_sigsegv:
+	bpf_map_memcg_exit(old_memcg, new_memcg);
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -557,7 +562,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 
 	/* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
 	alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
-	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
+	pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
 	if (!pages)
 		return 0;
 	data.pages = pages;
@@ -713,7 +718,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	return;
 
 defer:
-	s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
+	s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
 	if (!s)
 		/*
 		 * If allocation fails in non-sleepable context, pages are intentionally left
@@ -766,6 +771,7 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
 static void arena_free_worker(struct work_struct *work)
 {
 	struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct llist_node *list, *pos, *t;
 	struct arena_free_span *s;
 	u64 arena_vm_start, user_vm_start;
@@ -780,6 +786,8 @@ static void arena_free_worker(struct work_struct *work)
 		return;
 	}
 
+	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
+
 	init_llist_head(&free_pages);
 	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
 	user_vm_start = bpf_arena_get_user_vm_start(arena);
@@ -820,6 +828,8 @@ static void arena_free_worker(struct work_struct *work)
 		page = llist_entry(pos, struct page, pcp_llist);
 		__free_page(page);
 	}
+
+	bpf_map_memcg_exit(old_memcg, new_memcg);
 }
 
 static void arena_free_irq(struct irq_work *iw)
@@ -834,49 +844,69 @@ __bpf_kfunc_start_defs();
 __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
 					int node_id, u64 flags)
 {
+	void *ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
+	ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
+
+	return ret;
 }
 
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
 					  int node_id, u64 flags)
 {
+	void *ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
 		return NULL;
 
-	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
+	ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
+
+	return ret;
 }
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
 		return;
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
 	arena_free_pages(arena, (long)ptr__ign, page_cnt, true);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
 }
 
 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA || !page_cnt || !ptr__ign)
 		return;
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
 	arena_free_pages(arena, (long)ptr__ign, page_cnt, false);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
 }
 
 __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
+	int ret;
 	struct bpf_map *map = p__map;
+	struct mem_cgroup *new_memcg, *old_memcg;
 	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
 
 	if (map->map_type != BPF_MAP_TYPE_ARENA)
@@ -885,7 +915,11 @@ __bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_c
 	if (!page_cnt)
 		return 0;
 
-	return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+	bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
+	ret = arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+	bpf_map_memcg_exit(old_memcg, new_memcg);
+
+	return ret;
 }
 __bpf_kfunc_end_defs();
 
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 99c63d982c5d..2f28886f3ff7 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -149,7 +149,8 @@ int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
 			range_it_insert(rn, rt);
 
 			/* Add a range */
-			new_rn = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+			new_rn = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT,
+						NUMA_NO_NODE);
 			if (!new_rn)
 				return -ENOMEM;
 			new_rn->rn_start = last + 1;
@@ -234,7 +235,7 @@ int range_tree_set(struct range_tree *rt, u32 start, u32 len)
 		right->rn_start = start;
 		range_it_insert(right, rt);
 	} else {
-		left = kmalloc_nolock(sizeof(struct range_node), 0, NUMA_NO_NODE);
+		left = kmalloc_nolock(sizeof(struct range_node), __GFP_ACCOUNT, NUMA_NO_NODE);
 		if (!left)
 			return -ENOMEM;
 		left->rn_start = start;
-- 
2.47.3



* Re: [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting
  2025-12-31 14:14 ` [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting Puranjay Mohan
@ 2025-12-31 17:25   ` Alexei Starovoitov
  2025-12-31 17:30     ` Puranjay Mohan
  0 siblings, 1 reply; 5+ messages in thread
From: Alexei Starovoitov @ 2025-12-31 17:25 UTC (permalink / raw)
  To: Puranjay Mohan
  Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
	Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Kernel Team

On Wed, Dec 31, 2025 at 6:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> When arena allocations were converted from bpf_map_alloc_pages() to
> kmalloc_nolock() to support non-sleepable contexts, memcg accounting was
> inadvertently lost. This commit restores proper memory accounting for
> all arena-related allocations.
>
> All arena-related allocations are now accounted to the memcg of the
> process that created the bpf_arena.
>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>  kernel/bpf/arena.c      | 44 ++++++++++++++++++++++++++++++++++++-----
>  kernel/bpf/range_tree.c |  5 +++--
>  2 files changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 456ac989269d..45b55961683f 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  {
>         struct bpf_map *map = vmf->vma->vm_file->private_data;
>         struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
> +       struct mem_cgroup *new_memcg, *old_memcg;
>         struct page *page;
>         long kbase, kaddr;
>         unsigned long flags;
> @@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>                 /* already have a page vmap-ed */
>                 goto out;
>
> +       bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
> +
>         if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>                 /* User space requested to segfault when page is not allocated by bpf prog */
>                 goto out_unlock_sigsegv;
> @@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>                 goto out_unlock_sigsegv;
>         }
>         flush_vmap_cache(kaddr, PAGE_SIZE);
> +       bpf_map_memcg_exit(old_memcg, new_memcg);
>  out:
>         page_ref_add(page, 1);
>         raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>         vmf->page = page;
>         return 0;
>  out_unlock_sigsegv:
> +       bpf_map_memcg_exit(old_memcg, new_memcg);
>         raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>         return VM_FAULT_SIGSEGV;
>  }
> @@ -557,7 +562,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
>
>         /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
>         alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
> -       pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
> +       pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
>         if (!pages)
>                 return 0;
>         data.pages = pages;
> @@ -713,7 +718,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>         return;
>
>  defer:
> -       s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
> +       s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
>         if (!s)
>                 /*
>                  * If allocation fails in non-sleepable context, pages are intentionally left
> @@ -766,6 +771,7 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
>  static void arena_free_worker(struct work_struct *work)
>  {
>         struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
> +       struct mem_cgroup *new_memcg, *old_memcg;
>         struct llist_node *list, *pos, *t;
>         struct arena_free_span *s;
>         u64 arena_vm_start, user_vm_start;
> @@ -780,6 +786,8 @@ static void arena_free_worker(struct work_struct *work)
>                 return;
>         }
>
> +       bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
> +
>         init_llist_head(&free_pages);
>         arena_vm_start = bpf_arena_get_kern_vm_start(arena);
>         user_vm_start = bpf_arena_get_user_vm_start(arena);
> @@ -820,6 +828,8 @@ static void arena_free_worker(struct work_struct *work)
>                 page = llist_entry(pos, struct page, pcp_llist);
>                 __free_page(page);
>         }
> +
> +       bpf_map_memcg_exit(old_memcg, new_memcg);
>  }
>
>  static void arena_free_irq(struct irq_work *iw)
> @@ -834,49 +844,69 @@ __bpf_kfunc_start_defs();
>  __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
>                                         int node_id, u64 flags)
>  {
> +       void *ret;
>         struct bpf_map *map = p__map;
> +       struct mem_cgroup *new_memcg, *old_memcg;
>         struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
>
>         if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
>                 return NULL;
>
> -       return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> +       bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
> +       ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> +       bpf_map_memcg_exit(old_memcg, new_memcg);

Should this be done inside arena_alloc_pages()?

and similarly inside arena_free_pages()?
It doesn't look to me like the error path will get more complex,
just memcg_enter/exit in a few places.
That will avoid the copy-paste in the sleepable/non_sleepable kfuncs.
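
Something like this shape (untested sketch, just to illustrate):

	__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign,
						u32 page_cnt, int node_id, u64 flags)
	{
		struct bpf_map *map = p__map;
		struct bpf_arena *arena = container_of(map, struct bpf_arena, map);

		if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
			return NULL;

		/* memcg enter/exit moves inside arena_alloc_pages() */
		return (void *)arena_alloc_pages(arena, (long)addr__ign,
						 page_cnt, node_id, true);
	}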

Also remove the redundant memcg_enter/exit from bpf_map_alloc_pages();
no point in doing it twice.
And remove the #ifdef in patch 1.

pw-bot: cr


* Re: [PATCH bpf-next v2 2/2] bpf: arena: Reintroduce memcg accounting
  2025-12-31 17:25   ` Alexei Starovoitov
@ 2025-12-31 17:30     ` Puranjay Mohan
  0 siblings, 0 replies; 5+ messages in thread
From: Puranjay Mohan @ 2025-12-31 17:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Kernel Team

On Wed, Dec 31, 2025 at 5:25 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Dec 31, 2025 at 6:14 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > When arena allocations were converted from bpf_map_alloc_pages() to
> > kmalloc_nolock() to support non-sleepable contexts, memcg accounting was
> > inadvertently lost. This commit restores proper memory accounting for
> > all arena-related allocations.
> >
> > All arena-related allocations are now accounted to the memcg of the
> > process that created the bpf_arena.
> >
> > Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> > ---
> >  kernel/bpf/arena.c      | 44 ++++++++++++++++++++++++++++++++++++-----
> >  kernel/bpf/range_tree.c |  5 +++--
> >  2 files changed, 42 insertions(+), 7 deletions(-)
> >
> > diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> > index 456ac989269d..45b55961683f 100644
> > --- a/kernel/bpf/arena.c
> > +++ b/kernel/bpf/arena.c
> > @@ -360,6 +360,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
> >  {
> >         struct bpf_map *map = vmf->vma->vm_file->private_data;
> >         struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
> > +       struct mem_cgroup *new_memcg, *old_memcg;
> >         struct page *page;
> >         long kbase, kaddr;
> >         unsigned long flags;
> > @@ -377,6 +378,8 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
> >                 /* already have a page vmap-ed */
> >                 goto out;
> >
> > +       bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
> > +
> >         if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
> >                 /* User space requested to segfault when page is not allocated by bpf prog */
> >                 goto out_unlock_sigsegv;
> > @@ -400,12 +403,14 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
> >                 goto out_unlock_sigsegv;
> >         }
> >         flush_vmap_cache(kaddr, PAGE_SIZE);
> > +       bpf_map_memcg_exit(old_memcg, new_memcg);
> >  out:
> >         page_ref_add(page, 1);
> >         raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
> >         vmf->page = page;
> >         return 0;
> >  out_unlock_sigsegv:
> > +       bpf_map_memcg_exit(old_memcg, new_memcg);
> >         raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
> >         return VM_FAULT_SIGSEGV;
> >  }
> > @@ -557,7 +562,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
> >
> >         /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
> >         alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
> > -       pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
> > +       pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), __GFP_ACCOUNT, NUMA_NO_NODE);
> >         if (!pages)
> >                 return 0;
> >         data.pages = pages;
> > @@ -713,7 +718,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
> >         return;
> >
> >  defer:
> > -       s = kmalloc_nolock(sizeof(struct arena_free_span), 0, -1);
> > +       s = kmalloc_nolock(sizeof(struct arena_free_span), __GFP_ACCOUNT, -1);
> >         if (!s)
> >                 /*
> >                  * If allocation fails in non-sleepable context, pages are intentionally left
> > @@ -766,6 +771,7 @@ static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt
> >  static void arena_free_worker(struct work_struct *work)
> >  {
> >         struct bpf_arena *arena = container_of(work, struct bpf_arena, free_work);
> > +       struct mem_cgroup *new_memcg, *old_memcg;
> >         struct llist_node *list, *pos, *t;
> >         struct arena_free_span *s;
> >         u64 arena_vm_start, user_vm_start;
> > @@ -780,6 +786,8 @@ static void arena_free_worker(struct work_struct *work)
> >                 return;
> >         }
> >
> > +       bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
> > +
> >         init_llist_head(&free_pages);
> >         arena_vm_start = bpf_arena_get_kern_vm_start(arena);
> >         user_vm_start = bpf_arena_get_user_vm_start(arena);
> > @@ -820,6 +828,8 @@ static void arena_free_worker(struct work_struct *work)
> >                 page = llist_entry(pos, struct page, pcp_llist);
> >                 __free_page(page);
> >         }
> > +
> > +       bpf_map_memcg_exit(old_memcg, new_memcg);
> >  }
> >
> >  static void arena_free_irq(struct irq_work *iw)
> > @@ -834,49 +844,69 @@ __bpf_kfunc_start_defs();
> >  __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
> >                                         int node_id, u64 flags)
> >  {
> > +       void *ret;
> >         struct bpf_map *map = p__map;
> > +       struct mem_cgroup *new_memcg, *old_memcg;
> >         struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
> >
> >         if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
> >                 return NULL;
> >
> > -       return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> > +       bpf_map_memcg_enter(map, &old_memcg, &new_memcg);
> > +       ret = (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> > +       bpf_map_memcg_exit(old_memcg, new_memcg);
>
> Should this be done inside arena_alloc_pages() ?
>
> and similar inside arena_free_pages() ?
> Doesn't look to me that the error path will get more complex.
> just memcg_enter/exit in a few places.
> That will avoid copy paste in sleepable/non_sleepable kfuncs.

I thought it would be more readable here, as arena_alloc/free_pages()
are already complicated.

> Also remove redundant memcg_enter/exit from bpf_map_alloc_pages().
> No point in doing it twice.
> and remove #ifdef in patch 1.

Yes, now I see that there is no other user of that function.

