public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] bpf: Always defer local storage free
@ 2026-03-16 22:27 Andrea Righi
  2026-03-16 23:07 ` bot+bpf-ci
  2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
  0 siblings, 2 replies; 8+ messages in thread
From: Andrea Righi @ 2026-03-16 22:27 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Amery Hung, Tejun Heo, Emil Tsalapatis, bpf, sched-ext,
	linux-kernel

bpf_task_storage_delete() can be invoked from contexts that hold a raw
spinlock, such as sched_ext's ops.exit_task() callback, which runs
with the rq lock held.

The delete path eventually calls bpf_selem_unlink(), which frees the
element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
safe from raw spinlock context, triggering the following:

 =============================
 [ BUG: Invalid wait context ]
 7.0.0-rc1-virtme #1 Not tainted
 -----------------------------
 (udev-worker)/115 is trying to lock:
 ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
 other info that might help us debug this:
 context-{5:5}
 3 locks held by (udev-worker)/115:
  #0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
  #1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
  #2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
 ...
 Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x86_64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
 Call Trace:
  dump_stack_lvl+0x6f/0xb0
  __lock_acquire+0xf86/0x1de0
  lock_acquire+0xcf/0x310
  _raw_spin_lock_irqsave+0x39/0x60
  spin_lock_irqsave_ssp_contention+0x54/0x90
  srcu_gp_start_if_needed+0x2a7/0x490
  bpf_selem_unlink+0x24b/0x590
  bpf_task_storage_delete+0x3a/0x90
  bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
  bpf__sched_ext_ops_exit_task+0x4b/0xa7
  __scx_disable_and_exit_task+0x10a/0x200
  scx_disable_and_exit_task+0xe/0x60

Fix this by deferring the memory deallocation so that it always occurs
outside of the raw spinlock context.

Fixes: f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/bpf_local_storage.h |  1 +
 kernel/bpf/bpf_local_storage.c    | 96 +++++++++++++++++++++++++++++--
 2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d40..7e348a5c6b85d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -105,6 +105,7 @@ struct bpf_local_storage {
 	u64 mem_charge;		/* Copy of mem charged to owner. Protected by "lock" */
 	refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
 	bool use_kmalloc_nolock;
+	struct hlist_node deferred_free_node; /* Used for deferred free */
 };
 
 /* U16_MAX is much more than enough for sk local storage
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81a..0fbf6029e1361 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -14,9 +14,26 @@
 #include <linux/rcupdate.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/workqueue.h>
 
 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 
+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_selem_free_list);
+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_storage_free_list);
+static DEFINE_PER_CPU(atomic_t, bpf_deferred_free_pending);
+
+struct bpf_deferred_free_rcu {
+	struct rcu_head rcu;
+	int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_rcu, bpf_deferred_free_rcu);
+
+struct bpf_deferred_free_work {
+	struct work_struct work;
+	int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_work, bpf_deferred_free_work);
+
 static struct bpf_local_storage_map_bucket *
 select_bucket(struct bpf_local_storage_map *smap,
 	      struct bpf_local_storage *local_storage)
@@ -260,6 +277,80 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
 		bpf_selem_free(selem, reuse_now);
 }
 
+static void bpf_deferred_free_work_fn(struct work_struct *work)
+{
+	struct bpf_deferred_free_work *deferred_work =
+			container_of(work, struct bpf_deferred_free_work, work);
+	int cpu = deferred_work->cpu;
+	struct hlist_head *selem_list = per_cpu_ptr(&bpf_deferred_selem_free_list, cpu);
+	struct hlist_head *storage_list = per_cpu_ptr(&bpf_deferred_storage_free_list, cpu);
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *local_storage;
+	struct hlist_node *n;
+
+	atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+
+	hlist_for_each_entry_safe(selem, n, selem_list, free_node) {
+		hlist_del_init(&selem->free_node);
+		bpf_selem_free(selem, true);
+	}
+
+	hlist_for_each_entry_safe(local_storage, n, storage_list, deferred_free_node) {
+		hlist_del_init(&local_storage->deferred_free_node);
+		bpf_local_storage_free(local_storage, true);
+	}
+}
+
+static void bpf_deferred_free_rcu_callback(struct rcu_head *rcu)
+{
+	struct bpf_deferred_free_rcu *deferred =
+			container_of(rcu, struct bpf_deferred_free_rcu, rcu);
+	int cpu = deferred->cpu;
+	struct bpf_deferred_free_work *work = per_cpu_ptr(&bpf_deferred_free_work, cpu);
+
+	work->cpu = cpu;
+	queue_work_on(cpu, system_wq, &work->work);
+}
+
+static void bpf_selem_unlink_defer_free(struct hlist_head *selem_free_list,
+					struct bpf_local_storage *local_storage,
+					bool free_local_storage)
+{
+	struct bpf_local_storage_elem *s;
+	struct hlist_node *n;
+	struct hlist_head *deferred_selem = this_cpu_ptr(&bpf_deferred_selem_free_list);
+	struct hlist_head *deferred_storage = this_cpu_ptr(&bpf_deferred_storage_free_list);
+	struct bpf_deferred_free_rcu *deferred_rcu = this_cpu_ptr(&bpf_deferred_free_rcu);
+
+	hlist_for_each_entry_safe(s, n, selem_free_list, free_node) {
+		hlist_del(&s->free_node);
+		hlist_add_head(&s->free_node, deferred_selem);
+	}
+
+	if (free_local_storage)
+		hlist_add_head(&local_storage->deferred_free_node, deferred_storage);
+
+	if (atomic_cmpxchg(this_cpu_ptr(&bpf_deferred_free_pending), 0, 1) == 0) {
+		deferred_rcu->cpu = smp_processor_id();
+		call_rcu(&deferred_rcu->rcu, bpf_deferred_free_rcu_callback);
+	}
+}
+
+static int __init bpf_local_storage_deferred_free_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_selem_free_list, cpu));
+		INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_storage_free_list, cpu));
+		atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+		INIT_WORK(&per_cpu(bpf_deferred_free_work, cpu).work,
+			  bpf_deferred_free_work_fn);
+	}
+	return 0;
+}
+subsys_initcall(bpf_local_storage_deferred_free_init);
+
 static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
 						 struct bpf_local_storage_map *smap,
 						 struct bpf_local_storage *local_storage,
@@ -419,10 +510,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 out:
 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	bpf_selem_free_list(&selem_free_list, false);
-
-	if (free_local_storage)
-		bpf_local_storage_free(local_storage, false);
+	bpf_selem_unlink_defer_free(&selem_free_list, local_storage, free_local_storage);
 
 	return err;
 }
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-03-17 18:57 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-16 22:27 [PATCH] bpf: Always defer local storage free Andrea Righi
2026-03-16 23:07 ` bot+bpf-ci
2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
2026-03-17  6:25   ` Andrea Righi
2026-03-17  8:15     ` Andrea Righi
2026-03-17 18:46       ` Cheng-Yang Chou
2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
2026-03-17 18:57         ` Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox