[PATCH] bpf: Always defer local storage free

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] bpf: Always defer local storage free
@ 2026-03-16 22:27 Andrea Righi
  2026-03-16 23:07 ` bot+bpf-ci
  2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
  0 siblings, 2 replies; 8+ messages in thread
From: Andrea Righi @ 2026-03-16 22:27 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Amery Hung, Tejun Heo, Emil Tsalapatis, bpf, sched-ext,
	linux-kernel

bpf_task_storage_delete() can be invoked from contexts that hold a raw
spinlock, such as sched_ext's ops.exit_task() callback, that is running
with the rq lock held.

The delete path eventually calls bpf_selem_unlink(), which frees the
element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
safe from raw spinlock context, triggering the following:

 =============================
 [ BUG: Invalid wait context ]
 7.0.0-rc1-virtme #1 Not tainted
 -----------------------------
 (udev-worker)/115 is trying to lock:
 ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
 other info that might help us debug this:
 context-{5:5}
 3 locks held by (udev-worker)/115:
  #0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
  #1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
  #2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
 ...
 Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x86_64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
 Call Trace:
  dump_stack_lvl+0x6f/0xb0
  __lock_acquire+0xf86/0x1de0
  lock_acquire+0xcf/0x310
  _raw_spin_lock_irqsave+0x39/0x60
  spin_lock_irqsave_ssp_contention+0x54/0x90
  srcu_gp_start_if_needed+0x2a7/0x490
  bpf_selem_unlink+0x24b/0x590
  bpf_task_storage_delete+0x3a/0x90
  bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
  bpf__sched_ext_ops_exit_task+0x4b/0xa7
  __scx_disable_and_exit_task+0x10a/0x200
  scx_disable_and_exit_task+0xe/0x60

Fix by deferring memory deallocation to ensure it occurs outside the raw
spinlock context.

Fixes: f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/bpf_local_storage.h |  1 +
 kernel/bpf/bpf_local_storage.c    | 96 +++++++++++++++++++++++++++++--
 2 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d40..7e348a5c6b85d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -105,6 +105,7 @@ struct bpf_local_storage {
 	u64 mem_charge;		/* Copy of mem charged to owner. Protected by "lock" */
 	refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
 	bool use_kmalloc_nolock;
+	struct hlist_node deferred_free_node; /* Used for deferred free */
 };
 
 /* U16_MAX is much more than enough for sk local storage
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81a..0fbf6029e1361 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -14,9 +14,26 @@
 #include <linux/rcupdate.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/workqueue.h>
 
 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
 
+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_selem_free_list);
+static DEFINE_PER_CPU(struct hlist_head, bpf_deferred_storage_free_list);
+static DEFINE_PER_CPU(atomic_t, bpf_deferred_free_pending);
+
+struct bpf_deferred_free_rcu {
+	struct rcu_head rcu;
+	int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_rcu, bpf_deferred_free_rcu);
+
+struct bpf_deferred_free_work {
+	struct work_struct work;
+	int cpu;
+};
+static DEFINE_PER_CPU(struct bpf_deferred_free_work, bpf_deferred_free_work);
+
 static struct bpf_local_storage_map_bucket *
 select_bucket(struct bpf_local_storage_map *smap,
 	      struct bpf_local_storage *local_storage)
@@ -260,6 +277,80 @@ static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
 		bpf_selem_free(selem, reuse_now);
 }
 
+static void bpf_deferred_free_work_fn(struct work_struct *work)
+{
+	struct bpf_deferred_free_work *deferred_work =
+			container_of(work, struct bpf_deferred_free_work, work);
+	int cpu = deferred_work->cpu;
+	struct hlist_head *selem_list = per_cpu_ptr(&bpf_deferred_selem_free_list, cpu);
+	struct hlist_head *storage_list = per_cpu_ptr(&bpf_deferred_storage_free_list, cpu);
+	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage *local_storage;
+	struct hlist_node *n;
+
+	atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+
+	hlist_for_each_entry_safe(selem, n, selem_list, free_node) {
+		hlist_del_init(&selem->free_node);
+		bpf_selem_free(selem, true);
+	}
+
+	hlist_for_each_entry_safe(local_storage, n, storage_list, deferred_free_node) {
+		hlist_del_init(&local_storage->deferred_free_node);
+		bpf_local_storage_free(local_storage, true);
+	}
+}
+
+static void bpf_deferred_free_rcu_callback(struct rcu_head *rcu)
+{
+	struct bpf_deferred_free_rcu *deferred =
+			container_of(rcu, struct bpf_deferred_free_rcu, rcu);
+	int cpu = deferred->cpu;
+	struct bpf_deferred_free_work *work = per_cpu_ptr(&bpf_deferred_free_work, cpu);
+
+	work->cpu = cpu;
+	queue_work_on(cpu, system_wq, &work->work);
+}
+
+static void bpf_selem_unlink_defer_free(struct hlist_head *selem_free_list,
+					struct bpf_local_storage *local_storage,
+					bool free_local_storage)
+{
+	struct bpf_local_storage_elem *s;
+	struct hlist_node *n;
+	struct hlist_head *deferred_selem = this_cpu_ptr(&bpf_deferred_selem_free_list);
+	struct hlist_head *deferred_storage = this_cpu_ptr(&bpf_deferred_storage_free_list);
+	struct bpf_deferred_free_rcu *deferred_rcu = this_cpu_ptr(&bpf_deferred_free_rcu);
+
+	hlist_for_each_entry_safe(s, n, selem_free_list, free_node) {
+		hlist_del(&s->free_node);
+		hlist_add_head(&s->free_node, deferred_selem);
+	}
+
+	if (free_local_storage)
+		hlist_add_head(&local_storage->deferred_free_node, deferred_storage);
+
+	if (atomic_cmpxchg(this_cpu_ptr(&bpf_deferred_free_pending), 0, 1) == 0) {
+		deferred_rcu->cpu = smp_processor_id();
+		call_rcu(&deferred_rcu->rcu, bpf_deferred_free_rcu_callback);
+	}
+}
+
+static int __init bpf_local_storage_deferred_free_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_selem_free_list, cpu));
+		INIT_HLIST_HEAD(per_cpu_ptr(&bpf_deferred_storage_free_list, cpu));
+		atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
+		INIT_WORK(&per_cpu(bpf_deferred_free_work, cpu).work,
+			  bpf_deferred_free_work_fn);
+	}
+	return 0;
+}
+subsys_initcall(bpf_local_storage_deferred_free_init);
+
 static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
 						 struct bpf_local_storage_map *smap,
 						 struct bpf_local_storage *local_storage,
@@ -419,10 +510,7 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 out:
 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	bpf_selem_free_list(&selem_free_list, false);
-
-	if (free_local_storage)
-		bpf_local_storage_free(local_storage, false);
+	bpf_selem_unlink_defer_free(&selem_free_list, local_storage, free_local_storage);
 
 	return err;
 }
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-16 22:27 [PATCH] bpf: Always defer local storage free Andrea Righi
@ 2026-03-16 23:07 ` bot+bpf-ci
  2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
  1 sibling, 0 replies; 8+ messages in thread
From: bot+bpf-ci @ 2026-03-16 23:07 UTC (permalink / raw)
  To: arighi, ast, daniel, andrii
  Cc: john.fastabend, martin.lau, eddyz87, song, yonghong.song, kpsingh,
	sdf, haoluo, jolsa, ameryhung, tj, emil, bpf, sched-ext,
	linux-kernel, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai

[-- Attachment #1: Type: text/plain, Size: 4850 bytes --]

> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c

[ ... ]

> +static void bpf_selem_unlink_defer_free(struct hlist_head *selem_free_list,
> +					struct bpf_local_storage *local_storage,
> +					bool free_local_storage)
> +{
> +	struct bpf_local_storage_elem *s;
> +	struct hlist_node *n;
> +	struct hlist_head *deferred_selem = this_cpu_ptr(&bpf_deferred_selem_free_list);
> +	struct hlist_head *deferred_storage = this_cpu_ptr(&bpf_deferred_storage_free_list);
> +	struct bpf_deferred_free_rcu *deferred_rcu = this_cpu_ptr(&bpf_deferred_free_rcu);
> +
> +	hlist_for_each_entry_safe(s, n, selem_free_list, free_node) {
> +		hlist_del(&s->free_node);
> +		hlist_add_head(&s->free_node, deferred_selem);
> +	}
> +
> +	if (free_local_storage)
> +		hlist_add_head(&local_storage->deferred_free_node, deferred_storage);

Can these per-CPU lists be corrupted by concurrent access?

This function is called from bpf_selem_unlink() after releasing
local_storage->lock with raw_res_spin_unlock_irqrestore(), so IRQs
are restored. The BPF program context provides migrate_disable()
(keeping the CPU stable for this_cpu_ptr), but IRQs are not masked.

A BPF tracing program (kprobe, perf_event) running at hardirq or
NMI level can interrupt this function mid-hlist_add_head() and call
bpf_selem_unlink() -> bpf_selem_unlink_defer_free() on the same
CPU, modifying the same per-CPU list concurrently. hlist_add_head()
is a multi-step, non-atomic operation, so this interleaving can
lose list entries or corrupt the list pointers.

The same race exists between bpf_deferred_free_work_fn() (running
in process context on the workqueue) and
bpf_selem_unlink_defer_free() called from any BPF program that
interrupts it.

Would something like an llist (lock-free linked list that is safe
against concurrent producers) or local_irq_save/restore around
the list operations be needed here?

> +
> +	if (atomic_cmpxchg(this_cpu_ptr(&bpf_deferred_free_pending), 0, 1) == 0) {
> +		deferred_rcu->cpu = smp_processor_id();
> +		call_rcu(&deferred_rcu->rcu, bpf_deferred_free_rcu_callback);
> +	}
> +}

[ ... ]

> +static void bpf_deferred_free_work_fn(struct work_struct *work)
> +{
> +	struct bpf_deferred_free_work *deferred_work =
> +			container_of(work, struct bpf_deferred_free_work, work);
> +	int cpu = deferred_work->cpu;
> +	struct hlist_head *selem_list = per_cpu_ptr(&bpf_deferred_selem_free_list, cpu);
> +	struct hlist_head *storage_list = per_cpu_ptr(&bpf_deferred_storage_free_list, cpu);
> +	struct bpf_local_storage_elem *selem;
> +	struct bpf_local_storage *local_storage;
> +	struct hlist_node *n;
> +
> +	atomic_set(per_cpu_ptr(&bpf_deferred_free_pending, cpu), 0);
> +
> +	hlist_for_each_entry_safe(selem, n, selem_list, free_node) {
> +		hlist_del_init(&selem->free_node);
> +		bpf_selem_free(selem, true);
> +	}
> +
> +	hlist_for_each_entry_safe(local_storage, n, storage_list, deferred_free_node) {
> +		hlist_del_init(&local_storage->deferred_free_node);
> +		bpf_local_storage_free(local_storage, true);
> +	}
> +}

Does passing reuse_now=true here change the RCU synchronization
domain?

Previously, bpf_selem_unlink() called bpf_selem_free(selem, false),
which for use_kmalloc_nolock elements used
call_rcu_tasks_trace() to wait for a Tasks Trace RCU grace period
before freeing. This is needed because sleepable BPF programs hold
rcu_read_lock_trace() (see __bpf_prog_enter_sleepable()) and can
retain pointers to sdata across sleep points.

With this change, the work function calls bpf_selem_free(selem, true),
which uses regular call_rcu() instead. The full chain becomes:

  bpf_selem_unlink_defer_free()
    -> call_rcu()        [regular RCU grace period]
      -> queue_work_on()
        -> bpf_selem_free(selem, true)
          -> call_rcu()  [regular RCU grace period]
            -> kfree_nolock()

Two regular RCU grace periods do not substitute for one Tasks Trace
RCU grace period -- they are different synchronization domains. A
sleepable BPF program sleeping inside rcu_read_lock_trace() will
not block regular call_rcu() callbacks.

The reuse_now=true path (regular call_rcu) was previously only used
from bpf_selem_unlink_nofail() in the map_free/destroy paths, where
synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace) had already been
called to cover both domains. In this new usage, no Tasks Trace RCU
synchronization occurs before the free.

The same applies to bpf_local_storage_free(local_storage, true)
which similarly changes from call_rcu_tasks_trace() to call_rcu().

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/23169312812

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-16 22:27 [PATCH] bpf: Always defer local storage free Andrea Righi
  2026-03-16 23:07 ` bot+bpf-ci
@ 2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
  2026-03-17  6:25   ` Andrea Righi
  1 sibling, 1 reply; 8+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-03-16 23:39 UTC (permalink / raw)
  To: Andrea Righi, Paul E . McKenney
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	Amery Hung, Tejun Heo, Emil Tsalapatis, bpf, sched-ext,
	linux-kernel

On Mon, 16 Mar 2026 at 23:28, Andrea Righi <arighi@nvidia.com> wrote:
>
> bpf_task_storage_delete() can be invoked from contexts that hold a raw
> spinlock, such as sched_ext's ops.exit_task() callback, that is running
> with the rq lock held.
>
> The delete path eventually calls bpf_selem_unlink(), which frees the
> element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
> with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
> safe from raw spinlock context, triggering the following:
>

Paul posted [0] to fix it in SRCU. It was always safe to
call_rcu_tasks_trace() under raw spin lock, but became problematic on
RT with the recent conversion that uses SRCU underneath, please give
[0] a spin. While I couldn't reproduce the warning using scx_cosmos, I
verified that it goes away for me when calling the path from atomic
context.

  [0]: https://lore.kernel.org/rcu/841c8a0b-0f50-4617-98b2-76523e13b910@paulmck-laptop

>  =============================
>  [ BUG: Invalid wait context ]
>  7.0.0-rc1-virtme #1 Not tainted
>  -----------------------------
>  (udev-worker)/115 is trying to lock:
>  ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
>  other info that might help us debug this:
>  context-{5:5}
>  3 locks held by (udev-worker)/115:
>   #0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
>   #1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
>   #2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
>  ...
>  Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x86_64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
>  Call Trace:
>   dump_stack_lvl+0x6f/0xb0
>   __lock_acquire+0xf86/0x1de0
>   lock_acquire+0xcf/0x310
>   _raw_spin_lock_irqsave+0x39/0x60
>   spin_lock_irqsave_ssp_contention+0x54/0x90
>   srcu_gp_start_if_needed+0x2a7/0x490
>   bpf_selem_unlink+0x24b/0x590
>   bpf_task_storage_delete+0x3a/0x90
>   bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
>   bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
>   bpf__sched_ext_ops_exit_task+0x4b/0xa7
>   __scx_disable_and_exit_task+0x10a/0x200
>   scx_disable_and_exit_task+0xe/0x60
>
> Fix by deferring memory deallocation to ensure it occurs outside the raw
> spinlock context.
>
> Fixes: f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage")
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
> [...]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
@ 2026-03-17  6:25   ` Andrea Righi
  2026-03-17  8:15     ` Andrea Righi
  0 siblings, 1 reply; 8+ messages in thread
From: Andrea Righi @ 2026-03-17  6:25 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Paul E . McKenney, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Amery Hung, Tejun Heo,
	Emil Tsalapatis, bpf, sched-ext, linux-kernel

Hi Kumar,

On Tue, Mar 17, 2026 at 12:39:00AM +0100, Kumar Kartikeya Dwivedi wrote:
> On Mon, 16 Mar 2026 at 23:28, Andrea Righi <arighi@nvidia.com> wrote:
> >
> > bpf_task_storage_delete() can be invoked from contexts that hold a raw
> > spinlock, such as sched_ext's ops.exit_task() callback, that is running
> > with the rq lock held.
> >
> > The delete path eventually calls bpf_selem_unlink(), which frees the
> > element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
> > with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
> > safe from raw spinlock context, triggering the following:
> >
> 
> Paul posted [0] to fix it in SRCU. It was always safe to
> call_rcu_tasks_trace() under raw spin lock, but became problematic on
> RT with the recent conversion that uses SRCU underneath, please give
> [0] a spin. While I couldn't reproduce the warning using scx_cosmos, I
> verified that it goes away for me when calling the path from atomic
> context.
> 
>   [0]: https://lore.kernel.org/rcu/841c8a0b-0f50-4617-98b2-76523e13b910@paulmck-laptop

With this applied I get the following:

[   26.986798] ======================================================
[   26.986883] WARNING: possible circular locking dependency detected
[   26.986957] 7.0.0-rc4-virtme #15 Not tainted
[   26.987020] ------------------------------------------------------
[   26.987094] schbench/532 is trying to acquire lock:
[   26.987155] ffffffff9cd70d90 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}, at: raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
[   26.987313]
[   26.987313] but task is already holding lock:
[   26.987394] ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
[   26.987512]
[   26.987512] which lock already depends on the new lock.
[   26.987512]
[   26.987598]
[   26.987598] the existing dependency chain (in reverse order) is:
[   26.987704]
[   26.987704] -> #3 (&rq->__lock){-.-.}-{2:2}:
[   26.987779]        lock_acquire+0xcf/0x310
[   26.987844]        _raw_spin_lock_nested+0x2e/0x40
[   26.987911]        raw_spin_rq_lock_nested+0x24/0xb0
[   26.987973]        ___task_rq_lock+0x42/0x110
[   26.988034]        wake_up_new_task+0x198/0x440
[   26.988099]        kernel_clone+0x118/0x3c0
[   26.988149]        user_mode_thread+0x61/0x90
[   26.988222]        rest_init+0x1e/0x160
[   26.988272]        start_kernel+0x7a2/0x7b0
[   26.988329]        x86_64_start_reservations+0x24/0x30
[   26.988392]        x86_64_start_kernel+0xd1/0xe0
[   26.988451]        common_startup_64+0x13e/0x148
[   26.988523]
[   26.988523] -> #2 (&p->pi_lock){-.-.}-{2:2}:
[   26.988598]        lock_acquire+0xcf/0x310
[   26.988650]        _raw_spin_lock_irqsave+0x39/0x60
[   26.988718]        try_to_wake_up+0x57/0xbb0
[   26.988779]        create_worker+0x17e/0x200
[   26.988839]        workqueue_init+0x28d/0x300
[   26.988902]        kernel_init_freeable+0x134/0x2b0
[   26.988964]        kernel_init+0x1a/0x130
[   26.989016]        ret_from_fork+0x2bd/0x370
[   26.989079]        ret_from_fork_asm+0x1a/0x30
[   26.989143]
[   26.989143] -> #1 (&pool->lock){-.-.}-{2:2}:
[   26.989217]        lock_acquire+0xcf/0x310
[   26.989263]        _raw_spin_lock+0x30/0x40
[   26.989315]        __queue_work+0xdb/0x6d0
[   26.989367]        queue_delayed_work_on+0xc7/0xe0
[   26.989427]        srcu_gp_start_if_needed+0x3cc/0x540
[   26.989507]        __synchronize_srcu+0xf6/0x1b0
[   26.989567]        rcu_init_tasks_generic+0xfe/0x120
[   26.989626]        do_one_initcall+0x6f/0x300
[   26.989691]        kernel_init_freeable+0x24b/0x2b0
[   26.989750]        kernel_init+0x1a/0x130
[   26.989797]        ret_from_fork+0x2bd/0x370
[   26.989857]        ret_from_fork_asm+0x1a/0x30
[   26.989916]
[   26.989916] -> #0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}:
[   26.990015]        check_prev_add+0xe1/0xd30
[   26.990076]        __lock_acquire+0x1561/0x1de0
[   26.990137]        lock_acquire+0xcf/0x310
[   26.990182]        _raw_spin_lock_irqsave+0x39/0x60
[   26.990240]        raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
[   26.990312]        srcu_gp_start_if_needed+0x92/0x540
[   26.990370]        bpf_selem_unlink+0x267/0x5c0
[   26.990430]        bpf_task_storage_delete+0x3a/0x90
[   26.990495]        bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
[   26.990566]        bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
[   26.990636]        bpf__sched_ext_ops_exit_task+0x4b/0xa7
[   26.990694]        scx_exit_task+0x17a/0x230
[   26.990753]        sched_ext_dead+0xb2/0x120
[   26.990811]        finish_task_switch.isra.0+0x305/0x370
[   26.990870]        __schedule+0x576/0x1d60
[   26.990917]        schedule+0x3a/0x130
[   26.990962]        futex_do_wait+0x4a/0xa0
[   26.991008]        __futex_wait+0x8e/0xf0
[   26.991054]        futex_wait+0x78/0x120
[   26.991099]        do_futex+0xc5/0x190
[   26.991144]        __x64_sys_futex+0x12d/0x220
[   26.991202]        do_syscall_64+0x117/0xf80
[   26.991260]        entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   26.991318]
[   26.991318] other info that might help us debug this:
[   26.991318]
[   26.991400] Chain exists of:
[   26.991400]   rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
[   26.991400]
[   26.991524]  Possible unsafe locking scenario:
[   26.991524]
[   26.991592]        CPU0                    CPU1
[   26.991647]        ----                    ----
[   26.991702]   lock(&rq->__lock);
[   26.991747]                                lock(&p->pi_lock);
[   26.991816]                                lock(&rq->__lock);
[   26.991884]   lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
[   26.991953]
[   26.991953]  *** DEADLOCK ***
[   26.991953]
[   26.992021] 3 locks held by schbench/532:
[   26.992065]  #0: ffff8df7cc154f18 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
[   26.992151]  #1: ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
[   26.992250]  #2: ffffffff9cd71b20 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
[   26.992348]
[   26.992348] stack backtrace:
[   26.992406] CPU: 7 UID: 1000 PID: 532 Comm: schbench Not tainted 7.0.0-rc4-virtme #15 PREEMPT(full)
[   26.992409] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[   26.992411] Sched_ext: cosmos_1.1.0_g0949d453c_x86_64_unknown^_linux_gnu (enabled+all), task: runnable_at=+0ms
[   26.992412] Call Trace:
[   26.992414]  C<TASK>
[   26.992415]  dump_stack_lvl+0x6f/0xb0
[   26.992418]  print_circular_bug.cold+0x18b/0x1d6
[   26.992422]  check_noncircular+0x165/0x190
[   26.992425]  check_prev_add+0xe1/0xd30
[   26.992428]  __lock_acquire+0x1561/0x1de0
[   26.992430]  lock_acquire+0xcf/0x310
[   26.992431]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
[   26.992434]  _raw_spin_lock_irqsave+0x39/0x60
[   26.992435]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
[   26.992437]  raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
[   26.992439]  srcu_gp_start_if_needed+0x92/0x540
[   26.992441]  bpf_selem_unlink+0x267/0x5c0
[   26.992443]  bpf_task_storage_delete+0x3a/0x90
[   26.992445]  bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
[   26.992447]  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
[   26.992448]  bpf__sched_ext_ops_exit_task+0x4b/0xa7
[   26.992449]  scx_exit_task+0x17a/0x230
[   26.992451]  sched_ext_dead+0xb2/0x120
[   26.992453]  finish_task_switch.isra.0+0x305/0x370
[   26.992455]  __schedule+0x576/0x1d60
[   26.992457]  ? find_held_lock+0x2b/0x80
[   26.992460]  schedule+0x3a/0x130
[   26.992462]  futex_do_wait+0x4a/0xa0
[   26.992463]  __futex_wait+0x8e/0xf0
[   26.992465]  ? __pfx_futex_wake_mark+0x10/0x10
[   26.992468]  futex_wait+0x78/0x120
[   26.992469]  ? find_held_lock+0x2b/0x80
[   26.992472]  do_futex+0xc5/0x190
[   26.992473]  __x64_sys_futex+0x12d/0x220
[   26.992474]  ? restore_fpregs_from_fpstate+0x48/0xd0
[   26.992477]  do_syscall_64+0x117/0xf80
[   26.992478]  ? __irq_exit_rcu+0x38/0xc0
[   26.992481]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[   26.992482] RIP: 0033:0x7fe20e52eb1d

I can easily reproduce this (or the previous one) inside virtme-ng:

$ cat << EOF > /tmp/config
CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_DEBUG_INFO_BTF=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_SCHED_CLASS_EXT=y
CONFIG_KALLSYMS_ALL=y
CONFIG_FUNCTION_TRACER=y
CONFIG_SCHED_DEBUG=y
CONFIG_SCHED_AUTOGROUP=y
CONFIG_SCHED_CORE=y
CONFIG_SCHED_MC=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_DYNAMIC=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_PROVE_LOCKING=y
CONFIG_BPF_EVENTS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_KPROBES=y
CONFIG_KPROBE_EVENTS=y
CONFIG_UPROBES=y
CONFIG_UPROBE_EVENTS=y
CONFIG_DEBUG_FS=y
CONFIG_IKHEADERS=y
CONFIG_IKCONFIG_PROC=y
CONFIG_IKCONFIG=y
CONFIG_SCHED_CLASS_EXT=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_SCHED=y
CONFIG_EXT_GROUP_SCHED=y
CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_BTF=y
EOF
$ vng -vb --config /tmp/config
$ vng -v -- "scx_cosmos & schbench -L -m 4 -t 48 -n 0"

Thanks,
-Andrea

> 
> >  =============================
> >  [ BUG: Invalid wait context ]
> >  7.0.0-rc1-virtme #1 Not tainted
> >  -----------------------------
> >  (udev-worker)/115 is trying to lock:
> >  ffffffffa6970dd0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{3:3}, at: spin_lock_irqsave_ssp_contention+0x54/0x90
> >  other info that might help us debug this:
> >  context-{5:5}
> >  3 locks held by (udev-worker)/115:
> >   #0: ffff8e16c634ce58 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
> >   #1: ffff8e16fbdbdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> >   #2: ffffffffa6971b60 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
> >  ...
> >  Sched_ext: cosmos_1.0.7_g780e898fc_dirty_x86_64_unknown_linux_gnu (enabled+all), task: runnable_at=-2ms
> >  Call Trace:
> >   dump_stack_lvl+0x6f/0xb0
> >   __lock_acquire+0xf86/0x1de0
> >   lock_acquire+0xcf/0x310
> >   _raw_spin_lock_irqsave+0x39/0x60
> >   spin_lock_irqsave_ssp_contention+0x54/0x90
> >   srcu_gp_start_if_needed+0x2a7/0x490
> >   bpf_selem_unlink+0x24b/0x590
> >   bpf_task_storage_delete+0x3a/0x90
> >   bpf_prog_3b623b4be76cfb86_scx_pmu_task_fini+0x26/0x2a
> >   bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> >   bpf__sched_ext_ops_exit_task+0x4b/0xa7
> >   __scx_disable_and_exit_task+0x10a/0x200
> >   scx_disable_and_exit_task+0xe/0x60
> >
> > Fix by deferring memory deallocation to ensure it occurs outside the raw
> > spinlock context.
> >
> > Fixes: f484f4a3e058 ("bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage")
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
> > [...]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-17  6:25   ` Andrea Righi
@ 2026-03-17  8:15     ` Andrea Righi
  2026-03-17 18:46       ` Cheng-Yang Chou
  2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
  0 siblings, 2 replies; 8+ messages in thread
From: Andrea Righi @ 2026-03-17  8:15 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Paul E . McKenney, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Amery Hung, Tejun Heo,
	Emil Tsalapatis, bpf, sched-ext, linux-kernel

On Tue, Mar 17, 2026 at 07:25:18AM +0100, Andrea Righi wrote:
> Hi Kumar,
> 
> On Tue, Mar 17, 2026 at 12:39:00AM +0100, Kumar Kartikeya Dwivedi wrote:
> > On Mon, 16 Mar 2026 at 23:28, Andrea Righi <arighi@nvidia.com> wrote:
> > >
> > > bpf_task_storage_delete() can be invoked from contexts that hold a raw
> > > spinlock, such as sched_ext's ops.exit_task() callback, that is running
> > > with the rq lock held.
> > >
> > > The delete path eventually calls bpf_selem_unlink(), which frees the
> > > element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
> > > with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
> > > safe from raw spinlock context, triggering the following:
> > >
> > 
> > Paul posted [0] to fix it in SRCU. It was always safe to
> > call_rcu_tasks_trace() under raw spin lock, but became problematic on
> > RT with the recent conversion that uses SRCU underneath, please give
> > [0] a spin. While I couldn't reproduce the warning using scx_cosmos, I
> > verified that it goes away for me when calling the path from atomic
> > context.
> > 
> >   [0]: https://lore.kernel.org/rcu/841c8a0b-0f50-4617-98b2-76523e13b910@paulmck-laptop
> 
> With this applied I get the following:
> 
> [   26.986798] ======================================================
> [   26.986883] WARNING: possible circular locking dependency detected
> [   26.986957] 7.0.0-rc4-virtme #15 Not tainted
> [   26.987020] ------------------------------------------------------
> [   26.987094] schbench/532 is trying to acquire lock:
> [   26.987155] ffffffff9cd70d90 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}, at: raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> [   26.987313]
> [   26.987313] but task is already holding lock:
> [   26.987394] ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> [   26.987512]
> [   26.987512] which lock already depends on the new lock.
> [   26.987512]
> [   26.987598]
> [   26.987598] the existing dependency chain (in reverse order) is:
> [   26.987704]
> [   26.987704] -> #3 (&rq->__lock){-.-.}-{2:2}:
> [   26.987779]        lock_acquire+0xcf/0x310
> [   26.987844]        _raw_spin_lock_nested+0x2e/0x40
> [   26.987911]        raw_spin_rq_lock_nested+0x24/0xb0
> [   26.987973]        ___task_rq_lock+0x42/0x110
> [   26.988034]        wake_up_new_task+0x198/0x440
> [   26.988099]        kernel_clone+0x118/0x3c0
> [   26.988149]        user_mode_thread+0x61/0x90
> [   26.988222]        rest_init+0x1e/0x160
> [   26.988272]        start_kernel+0x7a2/0x7b0
> [   26.988329]        x86_64_start_reservations+0x24/0x30
> [   26.988392]        x86_64_start_kernel+0xd1/0xe0
> [   26.988451]        common_startup_64+0x13e/0x148
> [   26.988523]
> [   26.988523] -> #2 (&p->pi_lock){-.-.}-{2:2}:
> [   26.988598]        lock_acquire+0xcf/0x310
> [   26.988650]        _raw_spin_lock_irqsave+0x39/0x60
> [   26.988718]        try_to_wake_up+0x57/0xbb0
> [   26.988779]        create_worker+0x17e/0x200
> [   26.988839]        workqueue_init+0x28d/0x300
> [   26.988902]        kernel_init_freeable+0x134/0x2b0
> [   26.988964]        kernel_init+0x1a/0x130
> [   26.989016]        ret_from_fork+0x2bd/0x370
> [   26.989079]        ret_from_fork_asm+0x1a/0x30
> [   26.989143]
> [   26.989143] -> #1 (&pool->lock){-.-.}-{2:2}:
> [   26.989217]        lock_acquire+0xcf/0x310
> [   26.989263]        _raw_spin_lock+0x30/0x40
> [   26.989315]        __queue_work+0xdb/0x6d0
> [   26.989367]        queue_delayed_work_on+0xc7/0xe0
> [   26.989427]        srcu_gp_start_if_needed+0x3cc/0x540
> [   26.989507]        __synchronize_srcu+0xf6/0x1b0
> [   26.989567]        rcu_init_tasks_generic+0xfe/0x120
> [   26.989626]        do_one_initcall+0x6f/0x300
> [   26.989691]        kernel_init_freeable+0x24b/0x2b0
> [   26.989750]        kernel_init+0x1a/0x130
> [   26.989797]        ret_from_fork+0x2bd/0x370
> [   26.989857]        ret_from_fork_asm+0x1a/0x30
> [   26.989916]
> [   26.989916] -> #0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}:
> [   26.990015]        check_prev_add+0xe1/0xd30
> [   26.990076]        __lock_acquire+0x1561/0x1de0
> [   26.990137]        lock_acquire+0xcf/0x310
> [   26.990182]        _raw_spin_lock_irqsave+0x39/0x60
> [   26.990240]        raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> [   26.990312]        srcu_gp_start_if_needed+0x92/0x540
> [   26.990370]        bpf_selem_unlink+0x267/0x5c0
> [   26.990430]        bpf_task_storage_delete+0x3a/0x90
> [   26.990495]        bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> [   26.990566]        bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> [   26.990636]        bpf__sched_ext_ops_exit_task+0x4b/0xa7
> [   26.990694]        scx_exit_task+0x17a/0x230
> [   26.990753]        sched_ext_dead+0xb2/0x120
> [   26.990811]        finish_task_switch.isra.0+0x305/0x370
> [   26.990870]        __schedule+0x576/0x1d60
> [   26.990917]        schedule+0x3a/0x130
> [   26.990962]        futex_do_wait+0x4a/0xa0
> [   26.991008]        __futex_wait+0x8e/0xf0
> [   26.991054]        futex_wait+0x78/0x120
> [   26.991099]        do_futex+0xc5/0x190
> [   26.991144]        __x64_sys_futex+0x12d/0x220
> [   26.991202]        do_syscall_64+0x117/0xf80
> [   26.991260]        entry_SYSCALL_64_after_hwframe+0x77/0x7f
> [   26.991318]
> [   26.991318] other info that might help us debug this:
> [   26.991318]
> [   26.991400] Chain exists of:
> [   26.991400]   rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
> [   26.991400]
> [   26.991524]  Possible unsafe locking scenario:
> [   26.991524]
> [   26.991592]        CPU0                    CPU1
> [   26.991647]        ----                    ----
> [   26.991702]   lock(&rq->__lock);
> [   26.991747]                                lock(&p->pi_lock);
> [   26.991816]                                lock(&rq->__lock);
> [   26.991884]   lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
> [   26.991953]
> [   26.991953]  *** DEADLOCK ***
> [   26.991953]
> [   26.992021] 3 locks held by schbench/532:
> [   26.992065]  #0: ffff8df7cc154f18 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
> [   26.992151]  #1: ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> [   26.992250]  #2: ffffffff9cd71b20 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
> [   26.992348]
> [   26.992348] stack backtrace:
> [   26.992406] CPU: 7 UID: 1000 PID: 532 Comm: schbench Not tainted 7.0.0-rc4-virtme #15 PREEMPT(full)
> [   26.992409] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [   26.992411] Sched_ext: cosmos_1.1.0_g0949d453c_x86_64_unknown^_linux_gnu (enabled+all), task: runnable_at=+0ms
> [   26.992412] Call Trace:
> [   26.992414]  C<TASK>
> [   26.992415]  dump_stack_lvl+0x6f/0xb0
> [   26.992418]  print_circular_bug.cold+0x18b/0x1d6
> [   26.992422]  check_noncircular+0x165/0x190
> [   26.992425]  check_prev_add+0xe1/0xd30
> [   26.992428]  __lock_acquire+0x1561/0x1de0
> [   26.992430]  lock_acquire+0xcf/0x310
> [   26.992431]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> [   26.992434]  _raw_spin_lock_irqsave+0x39/0x60
> [   26.992435]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> [   26.992437]  raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> [   26.992439]  srcu_gp_start_if_needed+0x92/0x540
> [   26.992441]  bpf_selem_unlink+0x267/0x5c0
> [   26.992443]  bpf_task_storage_delete+0x3a/0x90
> [   26.992445]  bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> [   26.992447]  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> [   26.992448]  bpf__sched_ext_ops_exit_task+0x4b/0xa7
> [   26.992449]  scx_exit_task+0x17a/0x230
> [   26.992451]  sched_ext_dead+0xb2/0x120
> [   26.992453]  finish_task_switch.isra.0+0x305/0x370
> [   26.992455]  __schedule+0x576/0x1d60
> [   26.992457]  ? find_held_lock+0x2b/0x80
> [   26.992460]  schedule+0x3a/0x130
> [   26.992462]  futex_do_wait+0x4a/0xa0
> [   26.992463]  __futex_wait+0x8e/0xf0
> [   26.992465]  ? __pfx_futex_wake_mark+0x10/0x10
> [   26.992468]  futex_wait+0x78/0x120
> [   26.992469]  ? find_held_lock+0x2b/0x80
> [   26.992472]  do_futex+0xc5/0x190
> [   26.992473]  __x64_sys_futex+0x12d/0x220
> [   26.992474]  ? restore_fpregs_from_fpstate+0x48/0xd0
> [   26.992477]  do_syscall_64+0x117/0xf80
> [   26.992478]  ? __irq_exit_rcu+0x38/0xc0
> [   26.992481]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> [   26.992482] RIP: 0033:0x7fe20e52eb1d

With the following on top everything looks good on my side, let me know
what you think.

Thanks,
-Andrea

From: Andrea Righi <arighi@nvidia.com>
Subject: [PATCH] bpf: Avoid circular lock dependency when deleting local
 storage

Calling bpf_task_storage_delete() from a context that holds the runqueue
lock (e.g., sched_ext's ops.exit_task() callback) can lead to a circular
lock dependency:

 WARNING: possible circular locking dependency detected
 ...
 Chain exists of:
   rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&rq->__lock);
                                lock(&p->pi_lock);
                                lock(&rq->__lock);
   lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);

  *** DEADLOCK ***

Fix by adding a reuse_now flag to bpf_selem_unlink() with the same
meaning as in bpf_selem_free() and bpf_local_storage_free(). When the
task is in the TASK_DEAD state it will not run sleepable BPF again, so
it is safe to free storage immediately via call_rcu() instead of
call_rcu_tasks_trace() and we can prevent the circular lock dependency.

Other local storage types (sk, cgrp, inode) use reuse_now=false and keep
waiting for sleepable BPF before freeing.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/bpf_local_storage.h | 2 +-
 kernel/bpf/bpf_cgrp_storage.c     | 2 +-
 kernel/bpf/bpf_inode_storage.c    | 2 +-
 kernel/bpf/bpf_local_storage.c    | 6 +++---
 kernel/bpf/bpf_task_storage.c     | 7 ++++++-
 net/core/bpf_sk_storage.c         | 2 +-
 6 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d40..f5d4159646a83 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(struct bpf_map *map,
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_elem *selem);
 
-int bpf_selem_unlink(struct bpf_local_storage_elem *selem);
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
 
 int bpf_selem_link_map(struct bpf_local_storage_map *smap,
 		       struct bpf_local_storage *local_storage,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c2a2ead1f466d..853183eead2c2 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	return bpf_selem_unlink(SELEM(sdata));
+	return bpf_selem_unlink(SELEM(sdata), false);
 }
 
 static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e86734609f3d2..470f4b02c79ea 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	return bpf_selem_unlink(SELEM(sdata));
+	return bpf_selem_unlink(SELEM(sdata), false);
 }
 
 static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81a..caa1aa5bc17c7 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -385,7 +385,7 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
  * Unlink an selem from map and local storage with lock held.
  * This is the common path used by local storages to delete an selem.
  */
-int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -419,10 +419,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 out:
 	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	bpf_selem_free_list(&selem_free_list, false);
+	bpf_selem_free_list(&selem_free_list, reuse_now);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, false);
+		bpf_local_storage_free(local_storage, reuse_now);
 
 	return err;
 }
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 605506792b5b4..0311e2cd3f3e6 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -134,7 +134,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	return bpf_selem_unlink(SELEM(sdata));
+	/*
+	 * When the task is dead it won't run sleepable BPF again, so it is
+	 * safe to reuse storage immediately.
+	 */
+	return bpf_selem_unlink(SELEM(sdata),
+				READ_ONCE(task->__state) == TASK_DEAD);
 }
 
 static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index f8338acebf077..d20b4b5c99ef7 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	return bpf_selem_unlink(SELEM(sdata));
+	return bpf_selem_unlink(SELEM(sdata), false);
 }
 
 /* Called by __sk_destruct() & bpf_sk_storage_clone() */
-- 
2.53.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-17  8:15     ` Andrea Righi
@ 2026-03-17 18:46       ` Cheng-Yang Chou
  2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
  1 sibling, 0 replies; 8+ messages in thread
From: Cheng-Yang Chou @ 2026-03-17 18:46 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Kumar Kartikeya Dwivedi, Paul E . McKenney, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, John Fastabend,
	Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Amery Hung,
	Tejun Heo, Emil Tsalapatis, bpf, sched-ext, linux-kernel,
	Ching-Chun Huang

Hi Andrea,

On Tue, Mar 17, 2026 at 09:15:49AM +0100, Andrea Righi wrote:
> From: Andrea Righi <arighi@nvidia.com>
> Subject: [PATCH] bpf: Avoid circular lock dependency when deleting local
>  storage
> 
> Calling bpf_task_storage_delete() from a context that holds the runqueue
> lock (e.g., sched_ext's ops.exit_task() callback) can lead to a circular
> lock dependency:
> 
>  WARNING: possible circular locking dependency detected
>  ...
>  Chain exists of:
>    rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
> 
>   Possible unsafe locking scenario:
> 
>         CPU0                    CPU1
>         ----                    ----
>    lock(&rq->__lock);
>                                 lock(&p->pi_lock);
>                                 lock(&rq->__lock);
>    lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
> 
>   *** DEADLOCK ***
> 
> Fix by adding a reuse_now flag to bpf_selem_unlink() with the same
> meaning as in bpf_selem_free() and bpf_local_storage_free(). When the
> task is in the TASK_DEAD state it will not run sleepable BPF again, so
> it is safe to free storage immediately via call_rcu() instead of
> call_rcu_tasks_trace() and we can prevent the circular lock dependency.
> 
> Other local storage types (sk, cgrp, inode) use reuse_now=false and keep
> waiting for sleepable BPF before freeing.
> 
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  include/linux/bpf_local_storage.h | 2 +-
>  kernel/bpf/bpf_cgrp_storage.c     | 2 +-
>  kernel/bpf/bpf_inode_storage.c    | 2 +-
>  kernel/bpf/bpf_local_storage.c    | 6 +++---
>  kernel/bpf/bpf_task_storage.c     | 7 ++++++-
>  net/core/bpf_sk_storage.c         | 2 +-
>  6 files changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index 8157e8da61d40..f5d4159646a83 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -184,7 +184,7 @@ int bpf_local_storage_map_check_btf(struct bpf_map *map,
>  void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
>  				   struct bpf_local_storage_elem *selem);
>  
> -int bpf_selem_unlink(struct bpf_local_storage_elem *selem);
> +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
>  
>  int bpf_selem_link_map(struct bpf_local_storage_map *smap,
>  		       struct bpf_local_storage *local_storage,
> diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
> index c2a2ead1f466d..853183eead2c2 100644
> --- a/kernel/bpf/bpf_cgrp_storage.c
> +++ b/kernel/bpf/bpf_cgrp_storage.c
> @@ -89,7 +89,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
>  	if (!sdata)
>  		return -ENOENT;
>  
> -	return bpf_selem_unlink(SELEM(sdata));
> +	return bpf_selem_unlink(SELEM(sdata), false);
>  }
>  
>  static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
> diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
> index e86734609f3d2..470f4b02c79ea 100644
> --- a/kernel/bpf/bpf_inode_storage.c
> +++ b/kernel/bpf/bpf_inode_storage.c
> @@ -110,7 +110,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
>  	if (!sdata)
>  		return -ENOENT;
>  
> -	return bpf_selem_unlink(SELEM(sdata));
> +	return bpf_selem_unlink(SELEM(sdata), false);
>  }
>  
>  static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 9c96a4477f81a..caa1aa5bc17c7 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -385,7 +385,7 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
>   * Unlink an selem from map and local storage with lock held.
>   * This is the common path used by local storages to delete an selem.
>   */
> -int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
> +int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
>  {
>  	struct bpf_local_storage *local_storage;
>  	bool free_local_storage = false;
> @@ -419,10 +419,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
>  out:
>  	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
>  
> -	bpf_selem_free_list(&selem_free_list, false);
> +	bpf_selem_free_list(&selem_free_list, reuse_now);
>  
>  	if (free_local_storage)
> -		bpf_local_storage_free(local_storage, false);
> +		bpf_local_storage_free(local_storage, reuse_now);
>  
>  	return err;
>  }
> diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
> index 605506792b5b4..0311e2cd3f3e6 100644
> --- a/kernel/bpf/bpf_task_storage.c
> +++ b/kernel/bpf/bpf_task_storage.c
> @@ -134,7 +134,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
>  	if (!sdata)
>  		return -ENOENT;
>  
> -	return bpf_selem_unlink(SELEM(sdata));
> +	/*
> +	 * When the task is dead it won't run sleepable BPF again, so it is
> +	 * safe to reuse storage immediately.
> +	 */
> +	return bpf_selem_unlink(SELEM(sdata),
> +				READ_ONCE(task->__state) == TASK_DEAD);
>  }
>  
>  static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index f8338acebf077..d20b4b5c99ef7 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
>  	if (!sdata)
>  		return -ENOENT;
>  
> -	return bpf_selem_unlink(SELEM(sdata));
> +	return bpf_selem_unlink(SELEM(sdata), false);
>  }
>  
>  /* Called by __sk_destruct() & bpf_sk_storage_clone() */
> -- 
> 2.53.0
> 

I was able to reproduce the error on my end using the steps you 
provided earlier. After testing with this patch applied, the error
is resolved.

Tested-by: Cheng-Yang Chou <yphbchou0911@gmail.com>

-- 
Thanks,
Cheng-Yang

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-17  8:15     ` Andrea Righi
  2026-03-17 18:46       ` Cheng-Yang Chou
@ 2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
  2026-03-17 18:57         ` Andrea Righi
  1 sibling, 1 reply; 8+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-03-17 18:53 UTC (permalink / raw)
  To: Andrea Righi
  Cc: Paul E . McKenney, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Amery Hung, Tejun Heo,
	Emil Tsalapatis, bpf, sched-ext, linux-kernel

On Tue, 17 Mar 2026 at 09:16, Andrea Righi <arighi@nvidia.com> wrote:
>
> On Tue, Mar 17, 2026 at 07:25:18AM +0100, Andrea Righi wrote:
> > Hi Kumar,
> >
> > On Tue, Mar 17, 2026 at 12:39:00AM +0100, Kumar Kartikeya Dwivedi wrote:
> > > On Mon, 16 Mar 2026 at 23:28, Andrea Righi <arighi@nvidia.com> wrote:
> > > >
> > > > bpf_task_storage_delete() can be invoked from contexts that hold a raw
> > > > spinlock, such as sched_ext's ops.exit_task() callback, that is running
> > > > with the rq lock held.
> > > >
> > > > The delete path eventually calls bpf_selem_unlink(), which frees the
> > > > element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
> > > > with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
> > > > safe from raw spinlock context, triggering the following:
> > > >
> > >
> > > Paul posted [0] to fix it in SRCU. It was always safe to
> > > call_rcu_tasks_trace() under raw spin lock, but became problematic on
> > > RT with the recent conversion that uses SRCU underneath, please give
> > > [0] a spin. While I couldn't reproduce the warning using scx_cosmos, I
> > > verified that it goes away for me when calling the path from atomic
> > > context.
> > >
> > >   [0]: https://lore.kernel.org/rcu/841c8a0b-0f50-4617-98b2-76523e13b910@paulmck-laptop
> >
> > With this applied I get the following:
> >
> > [   26.986798] ======================================================
> > [   26.986883] WARNING: possible circular locking dependency detected
> > [   26.986957] 7.0.0-rc4-virtme #15 Not tainted
> > [   26.987020] ------------------------------------------------------
> > [   26.987094] schbench/532 is trying to acquire lock:
> > [   26.987155] ffffffff9cd70d90 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}, at: raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > [   26.987313]
> > [   26.987313] but task is already holding lock:
> > [   26.987394] ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> > [   26.987512]
> > [   26.987512] which lock already depends on the new lock.
> > [   26.987512]
> > [   26.987598]
> > [   26.987598] the existing dependency chain (in reverse order) is:
> > [   26.987704]
> > [   26.987704] -> #3 (&rq->__lock){-.-.}-{2:2}:
> > [   26.987779]        lock_acquire+0xcf/0x310
> > [   26.987844]        _raw_spin_lock_nested+0x2e/0x40
> > [   26.987911]        raw_spin_rq_lock_nested+0x24/0xb0
> > [   26.987973]        ___task_rq_lock+0x42/0x110
> > [   26.988034]        wake_up_new_task+0x198/0x440
> > [   26.988099]        kernel_clone+0x118/0x3c0
> > [   26.988149]        user_mode_thread+0x61/0x90
> > [   26.988222]        rest_init+0x1e/0x160
> > [   26.988272]        start_kernel+0x7a2/0x7b0
> > [   26.988329]        x86_64_start_reservations+0x24/0x30
> > [   26.988392]        x86_64_start_kernel+0xd1/0xe0
> > [   26.988451]        common_startup_64+0x13e/0x148
> > [   26.988523]
> > [   26.988523] -> #2 (&p->pi_lock){-.-.}-{2:2}:
> > [   26.988598]        lock_acquire+0xcf/0x310
> > [   26.988650]        _raw_spin_lock_irqsave+0x39/0x60
> > [   26.988718]        try_to_wake_up+0x57/0xbb0
> > [   26.988779]        create_worker+0x17e/0x200
> > [   26.988839]        workqueue_init+0x28d/0x300
> > [   26.988902]        kernel_init_freeable+0x134/0x2b0
> > [   26.988964]        kernel_init+0x1a/0x130
> > [   26.989016]        ret_from_fork+0x2bd/0x370
> > [   26.989079]        ret_from_fork_asm+0x1a/0x30
> > [   26.989143]
> > [   26.989143] -> #1 (&pool->lock){-.-.}-{2:2}:
> > [   26.989217]        lock_acquire+0xcf/0x310
> > [   26.989263]        _raw_spin_lock+0x30/0x40
> > [   26.989315]        __queue_work+0xdb/0x6d0
> > [   26.989367]        queue_delayed_work_on+0xc7/0xe0
> > [   26.989427]        srcu_gp_start_if_needed+0x3cc/0x540
> > [   26.989507]        __synchronize_srcu+0xf6/0x1b0
> > [   26.989567]        rcu_init_tasks_generic+0xfe/0x120
> > [   26.989626]        do_one_initcall+0x6f/0x300
> > [   26.989691]        kernel_init_freeable+0x24b/0x2b0
> > [   26.989750]        kernel_init+0x1a/0x130
> > [   26.989797]        ret_from_fork+0x2bd/0x370
> > [   26.989857]        ret_from_fork_asm+0x1a/0x30
> > [   26.989916]
> > [   26.989916] -> #0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}:
> > [   26.990015]        check_prev_add+0xe1/0xd30
> > [   26.990076]        __lock_acquire+0x1561/0x1de0
> > [   26.990137]        lock_acquire+0xcf/0x310
> > [   26.990182]        _raw_spin_lock_irqsave+0x39/0x60
> > [   26.990240]        raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > [   26.990312]        srcu_gp_start_if_needed+0x92/0x540
> > [   26.990370]        bpf_selem_unlink+0x267/0x5c0
> > [   26.990430]        bpf_task_storage_delete+0x3a/0x90
> > [   26.990495]        bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> > [   26.990566]        bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> > [   26.990636]        bpf__sched_ext_ops_exit_task+0x4b/0xa7
> > [   26.990694]        scx_exit_task+0x17a/0x230
> > [   26.990753]        sched_ext_dead+0xb2/0x120
> > [   26.990811]        finish_task_switch.isra.0+0x305/0x370
> > [   26.990870]        __schedule+0x576/0x1d60
> > [   26.990917]        schedule+0x3a/0x130
> > [   26.990962]        futex_do_wait+0x4a/0xa0
> > [   26.991008]        __futex_wait+0x8e/0xf0
> > [   26.991054]        futex_wait+0x78/0x120
> > [   26.991099]        do_futex+0xc5/0x190
> > [   26.991144]        __x64_sys_futex+0x12d/0x220
> > [   26.991202]        do_syscall_64+0x117/0xf80
> > [   26.991260]        entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > [   26.991318]
> > [   26.991318] other info that might help us debug this:
> > [   26.991318]
> > [   26.991400] Chain exists of:
> > [   26.991400]   rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
> > [   26.991400]
> > [   26.991524]  Possible unsafe locking scenario:
> > [   26.991524]
> > [   26.991592]        CPU0                    CPU1
> > [   26.991647]        ----                    ----
> > [   26.991702]   lock(&rq->__lock);
> > [   26.991747]                                lock(&p->pi_lock);
> > [   26.991816]                                lock(&rq->__lock);
> > [   26.991884]   lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
> > [   26.991953]
> > [   26.991953]  *** DEADLOCK ***
> > [   26.991953]
> > [   26.992021] 3 locks held by schbench/532:
> > [   26.992065]  #0: ffff8df7cc154f18 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
> > [   26.992151]  #1: ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> > [   26.992250]  #2: ffffffff9cd71b20 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
> > [   26.992348]
> > [   26.992348] stack backtrace:
> > [   26.992406] CPU: 7 UID: 1000 PID: 532 Comm: schbench Not tainted 7.0.0-rc4-virtme #15 PREEMPT(full)
> > [   26.992409] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> > [   26.992411] Sched_ext: cosmos_1.1.0_g0949d453c_x86_64_unknown^_linux_gnu (enabled+all), task: runnable_at=+0ms
> > [   26.992412] Call Trace:
> > [   26.992414]  C<TASK>
> > [   26.992415]  dump_stack_lvl+0x6f/0xb0
> > [   26.992418]  print_circular_bug.cold+0x18b/0x1d6
> > [   26.992422]  check_noncircular+0x165/0x190
> > [   26.992425]  check_prev_add+0xe1/0xd30
> > [   26.992428]  __lock_acquire+0x1561/0x1de0
> > [   26.992430]  lock_acquire+0xcf/0x310
> > [   26.992431]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > [   26.992434]  _raw_spin_lock_irqsave+0x39/0x60
> > [   26.992435]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > [   26.992437]  raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > [   26.992439]  srcu_gp_start_if_needed+0x92/0x540
> > [   26.992441]  bpf_selem_unlink+0x267/0x5c0
> > [   26.992443]  bpf_task_storage_delete+0x3a/0x90
> > [   26.992445]  bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> > [   26.992447]  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> > [   26.992448]  bpf__sched_ext_ops_exit_task+0x4b/0xa7
> > [   26.992449]  scx_exit_task+0x17a/0x230
> > [   26.992451]  sched_ext_dead+0xb2/0x120
> > [   26.992453]  finish_task_switch.isra.0+0x305/0x370
> > [   26.992455]  __schedule+0x576/0x1d60
> > [   26.992457]  ? find_held_lock+0x2b/0x80
> > [   26.992460]  schedule+0x3a/0x130
> > [   26.992462]  futex_do_wait+0x4a/0xa0
> > [   26.992463]  __futex_wait+0x8e/0xf0
> > [   26.992465]  ? __pfx_futex_wake_mark+0x10/0x10
> > [   26.992468]  futex_wait+0x78/0x120
> > [   26.992469]  ? find_held_lock+0x2b/0x80
> > [   26.992472]  do_futex+0xc5/0x190
> > [   26.992473]  __x64_sys_futex+0x12d/0x220
> > [   26.992474]  ? restore_fpregs_from_fpstate+0x48/0xd0
> > [   26.992477]  do_syscall_64+0x117/0xf80
> > [   26.992478]  ? __irq_exit_rcu+0x38/0xc0
> > [   26.992481]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > [   26.992482] RIP: 0033:0x7fe20e52eb1d
>
> With the following on top everything looks good on my side, let me know
> what you think.
>
> Thanks,
> -Andrea
>
> From: Andrea Righi <arighi@nvidia.com>
> Subject: [PATCH] bpf: Avoid circular lock dependency when deleting local
>  storage
>
> Calling bpf_task_storage_delete() from a context that holds the runqueue
> lock (e.g., sched_ext's ops.exit_task() callback) can lead to a circular
> lock dependency:
>
>  WARNING: possible circular locking dependency detected
>  ...
>  Chain exists of:
>    rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
>
>   Possible unsafe locking scenario:
>
>         CPU0                    CPU1
>         ----                    ----
>    lock(&rq->__lock);
>                                 lock(&p->pi_lock);
>                                 lock(&rq->__lock);
>    lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
>
>   *** DEADLOCK ***
>
> Fix by adding a reuse_now flag to bpf_selem_unlink() with the same
> meaning as in bpf_selem_free() and bpf_local_storage_free(). When the
> task is in the TASK_DEAD state it will not run sleepable BPF again, so
> it is safe to free storage immediately via call_rcu() instead of
> call_rcu_tasks_trace() and we can prevent the circular lock dependency.
>
> Other local storage types (sk, cgrp, inode) use reuse_now=false and keep
> waiting for sleepable BPF before freeing.
>
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
> [...]

Thanks for the report Andrea. The bug noted by lockdep looks real, and
Paul agrees it is something to fix, which he will look into.

https://lore.kernel.org/rcu/fe28d664-3872-40f6-83c6-818627ad5b7d@paulmck-laptop

The fix you provided below unfortunately can't work, we cannot free
the selem immediately as the program may have formed pointers to the
local storage before calling delete, so even if the task is dead
(which is task specific anyway, we don't address other local storages)
we can still have use-after-free after we return from
bpf_task_storage_delete() back to the program. We discussed this
'instant free' optimization several times in the past for local
storage to reduce call_rcu() pressure and realized it cannot work
correctly.

So the right fix again would be in SRCU, which would be to defer the
pi->lock -> rq->lock in call_srcu() when irqs_disabled() is true. This
should address the circular deadlock when calling it under the
protection of rq->lock, such as the case you hit.

Thanks

> [...]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] bpf: Always defer local storage free
  2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
@ 2026-03-17 18:57         ` Andrea Righi
  0 siblings, 0 replies; 8+ messages in thread
From: Andrea Righi @ 2026-03-17 18:57 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Paul E . McKenney, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, John Fastabend, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Amery Hung, Tejun Heo,
	Emil Tsalapatis, bpf, sched-ext, linux-kernel

Hi Kumar,

On Tue, Mar 17, 2026 at 07:53:04PM +0100, Kumar Kartikeya Dwivedi wrote:
> On Tue, 17 Mar 2026 at 09:16, Andrea Righi <arighi@nvidia.com> wrote:
> >
> > On Tue, Mar 17, 2026 at 07:25:18AM +0100, Andrea Righi wrote:
> > > Hi Kumar,
> > >
> > > On Tue, Mar 17, 2026 at 12:39:00AM +0100, Kumar Kartikeya Dwivedi wrote:
> > > > On Mon, 16 Mar 2026 at 23:28, Andrea Righi <arighi@nvidia.com> wrote:
> > > > >
> > > > > bpf_task_storage_delete() can be invoked from contexts that hold a raw
> > > > > spinlock, such as sched_ext's ops.exit_task() callback, that is running
> > > > > with the rq lock held.
> > > > >
> > > > > The delete path eventually calls bpf_selem_unlink(), which frees the
> > > > > element via bpf_selem_free_list() -> bpf_selem_free(). For task storage
> > > > > with use_kmalloc_nolock, call_rcu_tasks_trace() is used, which is not
> > > > > safe from raw spinlock context, triggering the following:
> > > > >
> > > >
> > > > Paul posted [0] to fix it in SRCU. It was always safe to
> > > > call_rcu_tasks_trace() under raw spin lock, but became problematic on
> > > > RT with the recent conversion that uses SRCU underneath, please give
> > > > [0] a spin. While I couldn't reproduce the warning using scx_cosmos, I
> > > > verified that it goes away for me when calling the path from atomic
> > > > context.
> > > >
> > > >   [0]: https://lore.kernel.org/rcu/841c8a0b-0f50-4617-98b2-76523e13b910@paulmck-laptop
> > >
> > > With this applied I get the following:
> > >
> > > [   26.986798] ======================================================
> > > [   26.986883] WARNING: possible circular locking dependency detected
> > > [   26.986957] 7.0.0-rc4-virtme #15 Not tainted
> > > [   26.987020] ------------------------------------------------------
> > > [   26.987094] schbench/532 is trying to acquire lock:
> > > [   26.987155] ffffffff9cd70d90 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}, at: raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > > [   26.987313]
> > > [   26.987313] but task is already holding lock:
> > > [   26.987394] ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> > > [   26.987512]
> > > [   26.987512] which lock already depends on the new lock.
> > > [   26.987512]
> > > [   26.987598]
> > > [   26.987598] the existing dependency chain (in reverse order) is:
> > > [   26.987704]
> > > [   26.987704] -> #3 (&rq->__lock){-.-.}-{2:2}:
> > > [   26.987779]        lock_acquire+0xcf/0x310
> > > [   26.987844]        _raw_spin_lock_nested+0x2e/0x40
> > > [   26.987911]        raw_spin_rq_lock_nested+0x24/0xb0
> > > [   26.987973]        ___task_rq_lock+0x42/0x110
> > > [   26.988034]        wake_up_new_task+0x198/0x440
> > > [   26.988099]        kernel_clone+0x118/0x3c0
> > > [   26.988149]        user_mode_thread+0x61/0x90
> > > [   26.988222]        rest_init+0x1e/0x160
> > > [   26.988272]        start_kernel+0x7a2/0x7b0
> > > [   26.988329]        x86_64_start_reservations+0x24/0x30
> > > [   26.988392]        x86_64_start_kernel+0xd1/0xe0
> > > [   26.988451]        common_startup_64+0x13e/0x148
> > > [   26.988523]
> > > [   26.988523] -> #2 (&p->pi_lock){-.-.}-{2:2}:
> > > [   26.988598]        lock_acquire+0xcf/0x310
> > > [   26.988650]        _raw_spin_lock_irqsave+0x39/0x60
> > > [   26.988718]        try_to_wake_up+0x57/0xbb0
> > > [   26.988779]        create_worker+0x17e/0x200
> > > [   26.988839]        workqueue_init+0x28d/0x300
> > > [   26.988902]        kernel_init_freeable+0x134/0x2b0
> > > [   26.988964]        kernel_init+0x1a/0x130
> > > [   26.989016]        ret_from_fork+0x2bd/0x370
> > > [   26.989079]        ret_from_fork_asm+0x1a/0x30
> > > [   26.989143]
> > > [   26.989143] -> #1 (&pool->lock){-.-.}-{2:2}:
> > > [   26.989217]        lock_acquire+0xcf/0x310
> > > [   26.989263]        _raw_spin_lock+0x30/0x40
> > > [   26.989315]        __queue_work+0xdb/0x6d0
> > > [   26.989367]        queue_delayed_work_on+0xc7/0xe0
> > > [   26.989427]        srcu_gp_start_if_needed+0x3cc/0x540
> > > [   26.989507]        __synchronize_srcu+0xf6/0x1b0
> > > [   26.989567]        rcu_init_tasks_generic+0xfe/0x120
> > > [   26.989626]        do_one_initcall+0x6f/0x300
> > > [   26.989691]        kernel_init_freeable+0x24b/0x2b0
> > > [   26.989750]        kernel_init+0x1a/0x130
> > > [   26.989797]        ret_from_fork+0x2bd/0x370
> > > [   26.989857]        ret_from_fork_asm+0x1a/0x30
> > > [   26.989916]
> > > [   26.989916] -> #0 (rcu_tasks_trace_srcu_struct_srcu_usage.lock){....}-{2:2}:
> > > [   26.990015]        check_prev_add+0xe1/0xd30
> > > [   26.990076]        __lock_acquire+0x1561/0x1de0
> > > [   26.990137]        lock_acquire+0xcf/0x310
> > > [   26.990182]        _raw_spin_lock_irqsave+0x39/0x60
> > > [   26.990240]        raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > > [   26.990312]        srcu_gp_start_if_needed+0x92/0x540
> > > [   26.990370]        bpf_selem_unlink+0x267/0x5c0
> > > [   26.990430]        bpf_task_storage_delete+0x3a/0x90
> > > [   26.990495]        bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> > > [   26.990566]        bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> > > [   26.990636]        bpf__sched_ext_ops_exit_task+0x4b/0xa7
> > > [   26.990694]        scx_exit_task+0x17a/0x230
> > > [   26.990753]        sched_ext_dead+0xb2/0x120
> > > [   26.990811]        finish_task_switch.isra.0+0x305/0x370
> > > [   26.990870]        __schedule+0x576/0x1d60
> > > [   26.990917]        schedule+0x3a/0x130
> > > [   26.990962]        futex_do_wait+0x4a/0xa0
> > > [   26.991008]        __futex_wait+0x8e/0xf0
> > > [   26.991054]        futex_wait+0x78/0x120
> > > [   26.991099]        do_futex+0xc5/0x190
> > > [   26.991144]        __x64_sys_futex+0x12d/0x220
> > > [   26.991202]        do_syscall_64+0x117/0xf80
> > > [   26.991260]        entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > > [   26.991318]
> > > [   26.991318] other info that might help us debug this:
> > > [   26.991318]
> > > [   26.991400] Chain exists of:
> > > [   26.991400]   rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
> > > [   26.991400]
> > > [   26.991524]  Possible unsafe locking scenario:
> > > [   26.991524]
> > > [   26.991592]        CPU0                    CPU1
> > > [   26.991647]        ----                    ----
> > > [   26.991702]   lock(&rq->__lock);
> > > [   26.991747]                                lock(&p->pi_lock);
> > > [   26.991816]                                lock(&rq->__lock);
> > > [   26.991884]   lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
> > > [   26.991953]
> > > [   26.991953]  *** DEADLOCK ***
> > > [   26.991953]
> > > [   26.992021] 3 locks held by schbench/532:
> > > [   26.992065]  #0: ffff8df7cc154f18 (&p->pi_lock){-.-.}-{2:2}, at: _task_rq_lock+0x2c/0x100
> > > [   26.992151]  #1: ffff8df7fb9bdae0 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested+0x24/0xb0
> > > [   26.992250]  #2: ffffffff9cd71b20 (rcu_read_lock){....}-{1:3}, at: __bpf_prog_enter+0x64/0x110
> > > [   26.992348]
> > > [   26.992348] stack backtrace:
> > > [   26.992406] CPU: 7 UID: 1000 PID: 532 Comm: schbench Not tainted 7.0.0-rc4-virtme #15 PREEMPT(full)
> > > [   26.992409] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> > > [   26.992411] Sched_ext: cosmos_1.1.0_g0949d453c_x86_64_unknown^_linux_gnu (enabled+all), task: runnable_at=+0ms
> > > [   26.992412] Call Trace:
> > > [   26.992414]  C<TASK>
> > > [   26.992415]  dump_stack_lvl+0x6f/0xb0
> > > [   26.992418]  print_circular_bug.cold+0x18b/0x1d6
> > > [   26.992422]  check_noncircular+0x165/0x190
> > > [   26.992425]  check_prev_add+0xe1/0xd30
> > > [   26.992428]  __lock_acquire+0x1561/0x1de0
> > > [   26.992430]  lock_acquire+0xcf/0x310
> > > [   26.992431]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > > [   26.992434]  _raw_spin_lock_irqsave+0x39/0x60
> > > [   26.992435]  ? raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > > [   26.992437]  raw_spin_lock_irqsave_sdp_contention+0x5b/0xe0
> > > [   26.992439]  srcu_gp_start_if_needed+0x92/0x540
> > > [   26.992441]  bpf_selem_unlink+0x267/0x5c0
> > > [   26.992443]  bpf_task_storage_delete+0x3a/0x90
> > > [   26.992445]  bpf_prog_134dba630b11d3b7_scx_pmu_task_fini+0x26/0x2a
> > > [   26.992447]  bpf_prog_4b1530d9d9852432_cosmos_exit_task+0x1d/0x1f
> > > [   26.992448]  bpf__sched_ext_ops_exit_task+0x4b/0xa7
> > > [   26.992449]  scx_exit_task+0x17a/0x230
> > > [   26.992451]  sched_ext_dead+0xb2/0x120
> > > [   26.992453]  finish_task_switch.isra.0+0x305/0x370
> > > [   26.992455]  __schedule+0x576/0x1d60
> > > [   26.992457]  ? find_held_lock+0x2b/0x80
> > > [   26.992460]  schedule+0x3a/0x130
> > > [   26.992462]  futex_do_wait+0x4a/0xa0
> > > [   26.992463]  __futex_wait+0x8e/0xf0
> > > [   26.992465]  ? __pfx_futex_wake_mark+0x10/0x10
> > > [   26.992468]  futex_wait+0x78/0x120
> > > [   26.992469]  ? find_held_lock+0x2b/0x80
> > > [   26.992472]  do_futex+0xc5/0x190
> > > [   26.992473]  __x64_sys_futex+0x12d/0x220
> > > [   26.992474]  ? restore_fpregs_from_fpstate+0x48/0xd0
> > > [   26.992477]  do_syscall_64+0x117/0xf80
> > > [   26.992478]  ? __irq_exit_rcu+0x38/0xc0
> > > [   26.992481]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> > > [   26.992482] RIP: 0033:0x7fe20e52eb1d
> >
> > With the following on top everything looks good on my side, let me know
> > what you think.
> >
> > Thanks,
> > -Andrea
> >
> > From: Andrea Righi <arighi@nvidia.com>
> > Subject: [PATCH] bpf: Avoid circular lock dependency when deleting local
> >  storage
> >
> > Calling bpf_task_storage_delete() from a context that holds the runqueue
> > lock (e.g., sched_ext's ops.exit_task() callback) can lead to a circular
> > lock dependency:
> >
> >  WARNING: possible circular locking dependency detected
> >  ...
> >  Chain exists of:
> >    rcu_tasks_trace_srcu_struct_srcu_usage.lock --> &p->pi_lock --> &rq->__lock
> >
> >   Possible unsafe locking scenario:
> >
> >         CPU0                    CPU1
> >         ----                    ----
> >    lock(&rq->__lock);
> >                                 lock(&p->pi_lock);
> >                                 lock(&rq->__lock);
> >    lock(rcu_tasks_trace_srcu_struct_srcu_usage.lock);
> >
> >   *** DEADLOCK ***
> >
> > Fix by adding a reuse_now flag to bpf_selem_unlink() with the same
> > meaning as in bpf_selem_free() and bpf_local_storage_free(). When the
> > task is in the TASK_DEAD state it will not run sleepable BPF again, so
> > it is safe to free storage immediately via call_rcu() instead of
> > call_rcu_tasks_trace() and we can prevent the circular lock dependency.
> >
> > Other local storage types (sk, cgrp, inode) use reuse_now=false and keep
> > waiting for sleepable BPF before freeing.
> >
> > Signed-off-by: Andrea Righi <arighi@nvidia.com>
> > ---
> > [...]
> 
> Thanks for the report Andrea. The bug noted by lockdep looks real, and
> Paul agrees it is something to fix, which he will look into.
> 
> https://lore.kernel.org/rcu/fe28d664-3872-40f6-83c6-818627ad5b7d@paulmck-laptop

Thanks!

> 
> The fix you provided below unfortunately can't work, we cannot free
> the selem immediately as the program may have formed pointers to the
> local storage before calling delete, so even if the task is dead
> (which is task specific anyway, we don't address other local storages)
> we can still have use-after-free after we return from
> bpf_task_storage_delete() back to the program. We discussed this
> 'instant free' optimization several times in the past for local
> storage to reduce call_rcu() pressure and realized it cannot work
> correctly.
> 
> So the right fix again would be in SRCU, which would be to defer the
> pi->lock -> rq->lock in call_srcu() when irqs_disabled() is true. This
> should address the circular deadlock when calling it under the
> protection of rq->lock, such as the case you hit.

Sure, I sent that "fix" just to provide more details on the issue. :)

-Andrea

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-03-17 18:57 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-16 22:27 [PATCH] bpf: Always defer local storage free Andrea Righi
2026-03-16 23:07 ` bot+bpf-ci
2026-03-16 23:39 ` Kumar Kartikeya Dwivedi
2026-03-17  6:25   ` Andrea Righi
2026-03-17  8:15     ` Andrea Righi
2026-03-17 18:46       ` Cheng-Yang Chou
2026-03-17 18:53       ` Kumar Kartikeya Dwivedi
2026-03-17 18:57         ` Andrea Righi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox