Subject: [PATCH bpf] bpf/helpers: Skip memcg accounting in __bpf_async_init()
From: Peilin Ye @ 2025-09-05  6:19 UTC
  To: bpf
  Cc: Peilin Ye, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Kumar Kartikeya Dwivedi, Josh Don, Barret Rhoden

Calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various
locking issues; see the following stack trace (edited for style) as one
example:

...
 [10.011566]  do_raw_spin_lock.cold
 [10.011570]  try_to_wake_up             (5) double-acquiring the same
 [10.011575]  kick_pool                      rq_lock, causing a hardlockup
 [10.011579]  __queue_work
 [10.011582]  queue_work_on
 [10.011585]  kernfs_notify
 [10.011589]  cgroup_file_notify
 [10.011593]  try_charge_memcg           (4) memcg accounting raises an
 [10.011597]  obj_cgroup_charge_pages        MEMCG_MAX event
 [10.011599]  obj_cgroup_charge_account
 [10.011600]  __memcg_slab_post_alloc_hook
 [10.011603]  __kmalloc_node_noprof
...
 [10.011611]  bpf_map_kmalloc_node
 [10.011612]  __bpf_async_init
 [10.011615]  bpf_timer_init             (3) BPF calls bpf_timer_init()
 [10.011617]  bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable
 [10.011619]  bpf__sched_ext_ops_runnable
 [10.011620]  enqueue_task_scx           (2) BPF runs with rq_lock held
 [10.011622]  enqueue_task
 [10.011626]  ttwu_do_activate
 [10.011629]  sched_ttwu_pending         (1) grabs rq_lock
...

The above was reproduced on bpf-next (b338cf849ec8) by modifying
./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during
ops.runnable(), and hacking [1] the memcg accounting code a bit to make
it (much more likely to) raise an MEMCG_MAX event from a
bpf_timer_init() call.
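
For reference, the scx_flatcg.bpf.c change was roughly of the following
shape (a sketch only; the map name and layout are illustrative, not the
exact diff that was used):

 struct timer_elem {
 	struct bpf_timer timer;
 };

 struct {
 	__uint(type, BPF_MAP_TYPE_ARRAY);
 	__uint(max_entries, 1);
 	__type(key, u32);
 	__type(value, struct timer_elem);
 } hack_timers SEC(".maps");

 void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
 {
 	u32 key = 0;
 	struct timer_elem *e = bpf_map_lookup_elem(&hack_timers, &key);

 	/* ops.runnable() runs with rq_lock held; only the first call per
 	 * map entry reaches the allocation in __bpf_async_init(), later
 	 * calls return -EBUSY
 	 */
 	if (e)
 		bpf_timer_init(&e->timer, &hack_timers, 0 /* CLOCK_REALTIME */);
 }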

We have also run into other variants of this, both internally (without
the [1] hack applied) and on bpf-next, including:

 * run_timer_softirq() -> cgroup_file_notify()
   (grabs cgroup_file_kn_lock) -> try_to_wake_up() ->
   BPF calls bpf_timer_init() -> bpf_map_kmalloc_node() ->
   try_charge_memcg() raises MEMCG_MAX ->
   cgroup_file_notify() (tries to grab cgroup_file_kn_lock again)

 * __queue_work() (grabs worker_pool::lock) -> try_to_wake_up() ->
   BPF calls bpf_timer_init() -> bpf_map_kmalloc_node() ->
   try_charge_memcg() raises MEMCG_MAX -> cgroup_file_notify() ->
   __queue_work() (tries to grab the same worker_pool::lock)
 ...

As pointed out by Kumar, we can instead allocate bpf_hrtimer and
bpf_work objects with bpf_mem_alloc() and friends, which skips memcg
accounting.
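
In sketch form, the core of the change (the comments reflect my reading
of kernel/bpf/memalloc.c, not text from the patch below):

 	/* before: charged to the map's memcg; under pressure the charge
 	 * path can raise MEMCG_MAX -> cgroup_file_notify() ->
 	 * try_to_wake_up() while the caller already holds e.g. rq_lock
 	 */
 	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);

 	/* after: pop from bpf_global_ma's per-CPU free lists; any refill
 	 * happens later via irq_work, so no memcg charge (and hence no
 	 * wakeup) is triggered under the caller's locks
 	 */
 	cb = bpf_mem_alloc(&bpf_global_ma, size);

 	/* frees must match the allocator, so kfree_rcu() becomes: */
 	bpf_mem_free_rcu(&bpf_global_ma, cb);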

Tested with vmtest.sh (llvm-18, x86-64):

 $ ./test_progs -a '*timer*' -a '*wq*'
...
 Summary: 7/12 PASSED, 0 SKIPPED, 0 FAILED

[1] Making a bpf_timer_init() call (much) more likely to raise an
MEMCG_MAX event (gist only, for brevity):

kernel/bpf/helpers.c:__bpf_async_init():
          /* allocate hrtimer via map_kmalloc to use memcg accounting */
 -        cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
 +        cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC | __GFP_HACK,
 +                                  map->numa_node);

mm/memcontrol.c:try_charge_memcg():
          if (!do_memsw_account() ||
 -            page_counter_try_charge(&memcg->memsw, batch, &counter)) {
 -                if (page_counter_try_charge(&memcg->memory, batch, &counter))
 +            page_counter_try_charge_hack(&memcg->memsw, batch, &counter,
 +                                         gfp_mask & __GFP_HACK)) {
 +                if (page_counter_try_charge_hack(&memcg->memory, batch,
 +                                                 &counter,
 +                                                 gfp_mask & __GFP_HACK))
                          goto done_restock;

mm/page_counter.c:page_counter_try_charge():
 -bool page_counter_try_charge(struct page_counter *counter,
 -                             unsigned long nr_pages,
 -                             struct page_counter **fail)
 +bool page_counter_try_charge_hack(struct page_counter *counter,
 +                                  unsigned long nr_pages,
 +                                  struct page_counter **fail, bool hack)
 {
...
 -                if (new > c->max) {
 +                if (hack || new > c->max) {     /* force "goto failed" */
                          atomic_long_sub(nr_pages, &c->usage);
                          /*

Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Peilin Ye <yepeilin@google.com>
---
 kernel/bpf/helpers.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..f7f3c6fb59ee 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1085,10 +1085,7 @@ struct bpf_async_cb {
 	struct bpf_prog *prog;
 	void __rcu *callback_fn;
 	void *value;
-	union {
-		struct rcu_head rcu;
-		struct work_struct delete_work;
-	};
+	struct work_struct delete_work;
 	u64 flags;
 };
 
@@ -1221,7 +1218,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_mem_free_rcu(&bpf_global_ma, w);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1230,13 +1227,13 @@ static void bpf_timer_delete_work(struct work_struct *work)
 
 	/* Cancel the timer and wait for callback to complete if it was running.
 	 * If hrtimer_cancel() can be safely called it's safe to call
-	 * kfree_rcu(t) right after for both preallocated and non-preallocated
+	 * bpf_mem_free_rcu(t) right after for both preallocated and non-preallocated
 	 * maps.  The async->cb = NULL was already done and no code path can see
 	 * address 't' anymore. Timer if armed for existing bpf_hrtimer before
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_mem_free_rcu(&bpf_global_ma, t);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1270,8 +1267,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	cb = bpf_mem_alloc(&bpf_global_ma, size);
 	if (!cb) {
 		ret = -ENOMEM;
 		goto out;
@@ -1567,7 +1563,7 @@ void bpf_timer_cancel_and_free(void *val)
 	 * callback_fn. In such case we don't call hrtimer_cancel() (since it
 	 * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
 	 * just return -1). Though callback_fn is still running on this cpu it's
-	 * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+	 * safe to free 't' because bpf_timer_cb() read everything it needed
 	 * from 't'. The bpf subprog callback_fn won't be able to access 't',
 	 * since async->cb = NULL was already done. The timer will be
 	 * effectively cancelled because bpf_timer_cb() will return
@@ -1577,7 +1573,7 @@ void bpf_timer_cancel_and_free(void *val)
 	 * timer _before_ calling us, such that failing to cancel it here will
 	 * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
 	 * Therefore, we _need_ to cancel any outstanding timers before we do
-	 * kfree_rcu, even though no more timers can be armed.
+	 * bpf_mem_free_rcu(), even though no more timers can be armed.
 	 *
 	 * Moreover, we need to schedule work even if timer does not belong to
 	 * the calling callback_fn, as on two different CPUs, we can end up in a
@@ -1604,7 +1600,7 @@ void bpf_timer_cancel_and_free(void *val)
 		 * completion.
 		 */
 		if (hrtimer_try_to_cancel(&t->timer) >= 0)
-			kfree_rcu(t, cb.rcu);
+			bpf_mem_free_rcu(&bpf_global_ma, t);
 		else
 			queue_work(system_unbound_wq, &t->cb.delete_work);
 	} else {
-- 
2.51.0.355.g5224444f11-goog

