From: Amery Hung <ameryhung@gmail.com>
To: bpf@vger.kernel.org
Cc: netdev@vger.kernel.org, alexei.starovoitov@gmail.com,
andrii@kernel.org, daniel@iogearbox.net, memxor@gmail.com,
martin.lau@kernel.org, kpsingh@kernel.org,
yonghong.song@linux.dev, song@kernel.org, haoluo@google.com,
ameryhung@gmail.com, kernel-team@meta.com
Subject: [PATCH bpf-next v4 11/16] bpf: Switch to bpf_selem_unlink_nofail in bpf_local_storage_{map_free, destroy}
Date: Fri, 30 Jan 2026 21:09:14 -0800 [thread overview]
Message-ID: <20260131050920.2574084-12-ameryhung@gmail.com> (raw)
In-Reply-To: <20260131050920.2574084-1-ameryhung@gmail.com>
Take care of rqspinlock error in bpf_local_storage_{map_free, destroy}()
properly by switching to bpf_selem_unlink_nofail().
Both functions iterate their own RCU-protected list of selems and call
bpf_selem_unlink_nofail(). In map_free(), to prevent an infinite loop when
both map_free() and destroy() fail to remove a selem from b->list
(extremely unlikely), switch to hlist_for_each_entry_rcu(). In destroy(),
also switch to hlist_for_each_entry_rcu() since we no longer iterate
local_storage->list under local_storage->lock. In addition, defer the
iteration to a workqueue since sleeping may not always be allowed in destroy().
Since selem, SDATA(selem)->smap and selem->local_storage may be seen by
map_free() and destroy() at the same time, protect them with RCU. This
means passing reuse_now == false to bpf_selem_free() and
bpf_local_storage_free(). The local storage map is already protected as
bpf_local_storage_map_free() waits for an RCU grace period after
iterating b->list and before freeing itself.
bpf_selem_unlink() is now dedicated to the helper and syscall paths,
where reuse_now should always be false. Remove the reuse_now argument
and hardcode the value.
Co-developed-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf_local_storage.h | 5 +-
kernel/bpf/bpf_cgrp_storage.c | 3 +-
kernel/bpf/bpf_inode_storage.c | 3 +-
kernel/bpf/bpf_local_storage.c | 96 +++++++++++++++++--------------
kernel/bpf/bpf_task_storage.c | 3 +-
net/core/bpf_sk_storage.c | 9 ++-
6 files changed, 69 insertions(+), 50 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 4e1ebfb3b9e8..605590a8f98d 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -101,6 +101,7 @@ struct bpf_local_storage {
rqspinlock_t lock; /* Protect adding/removing from the "list" */
u64 selems_size; /* Total selem size. Protected by "lock" */
refcount_t owner_refcnt;
+ struct work_struct work;
bool use_kmalloc_nolock;
};
@@ -168,7 +169,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
return SDATA(selem);
}
-void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
+u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
void bpf_local_storage_map_free(struct bpf_map *map,
struct bpf_local_storage_cache *cache);
@@ -181,7 +182,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem);
-int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem);
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *selem);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 853183eead2c..0bc3ab19c7b4 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -27,6 +27,7 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
if (!local_storage)
goto out;
+ RCU_INIT_POINTER(cgroup->bpf_cgrp_storage, NULL);
bpf_local_storage_destroy(local_storage);
out:
rcu_read_unlock();
@@ -89,7 +90,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- return bpf_selem_unlink(SELEM(sdata), false);
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 470f4b02c79e..eb607156ba35 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -68,6 +68,7 @@ void bpf_inode_storage_free(struct inode *inode)
if (!local_storage)
goto out;
+ RCU_INIT_POINTER(bsb->storage, NULL);
bpf_local_storage_destroy(local_storage);
out:
rcu_read_unlock_migrate();
@@ -110,7 +111,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- return bpf_selem_unlink(SELEM(sdata), false);
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 54d106ebbfe5..364198959053 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -383,7 +383,11 @@ static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
hlist_add_head_rcu(&selem->map_node, &b->list);
}
-int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
+/*
+ * Unlink an selem from map and local storage with lock held.
+ * This is the common path used by local storages to delete an selem.
+ */
+int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage *local_storage;
bool free_local_storage = false;
@@ -417,10 +421,10 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
out:
raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
- bpf_selem_free_list(&selem_free_list, reuse_now);
+ bpf_selem_free_list(&selem_free_list, false);
if (free_local_storage)
- bpf_local_storage_free(local_storage, reuse_now);
+ bpf_local_storage_free(local_storage, false);
return err;
}
@@ -650,7 +654,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
- if (!local_storage || hlist_empty(&local_storage->list)) {
+ if (!local_storage) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
if (err)
@@ -698,17 +702,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- /* Recheck local_storage->list under local_storage->lock */
- if (unlikely(hlist_empty(&local_storage->list))) {
- /* A parallel del is happening and local_storage is going
- * away. It has just been checked before, so very
- * unlikely. Return instead of retry to keep things
- * simple.
- */
- err = -EAGAIN;
- goto unlock;
- }
-
old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
err = check_flags(old_sdata, map_flags);
if (err)
@@ -811,13 +804,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+/*
+ * Deferred looping local_storage->list to workqueue since sleeping may not be
+ * allowed in bpf_local_storage_destroy()
+ */
+static void bpf_local_storage_free_deferred(struct work_struct *work)
{
+ struct bpf_local_storage *local_storage;
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
- HLIST_HEAD(free_selem_list);
- struct hlist_node *n;
- unsigned long flags;
+
+ local_storage = container_of(work, struct bpf_local_storage, work);
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -828,33 +824,44 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
- raw_res_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- /* If local_storage list has only one element, the
- * bpf_selem_unlink_storage_nolock() will return true.
- * Otherwise, it will return false. The current loop iteration
- * intends to remove all local storage. So the last iteration
- * of the loop will set the free_cgroup_storage to true.
- */
- free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, &free_selem_list);
+ rcu_read_lock();
+restart:
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode) {
+ bpf_selem_unlink_nofail(selem, NULL);
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ goto restart;
+ }
}
- raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
+ rcu_read_unlock();
- bpf_selem_free_list(&free_selem_list, true);
+ bpf_local_storage_free(local_storage, false);
+}
+
+/*
+ * Destroy local storage when the owner is going away. Caller must clear owner->storage
+ * and uncharge memory if memory charging is used.
+ *
+ * Since smaps associated with selems may already be gone, mem_uncharge() or
+ * owner_storage() cannot be called in this function. Let the owner (i.e., the caller)
+ * do it instead.
+ */
+u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
+{
+ INIT_WORK(&local_storage->work, bpf_local_storage_free_deferred);
- if (free_storage)
- bpf_local_storage_free(local_storage, true);
+ queue_work(system_dfl_wq, &local_storage->work);
if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
while (refcount_read(&local_storage->owner_refcnt))
cpu_relax();
smp_rmb(); /* pair with refcount_dec in bpf_selem_unlink_nofail */
}
+
+ local_storage->owner = NULL;
+
+ return sizeof(*local_storage) + local_storage->selems_size;
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -948,11 +955,14 @@ void bpf_local_storage_map_free(struct bpf_map *map,
rcu_read_lock();
/* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- bpf_selem_unlink(selem, true);
- cond_resched_rcu();
+restart:
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ bpf_selem_unlink_nofail(selem, b);
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ goto restart;
+ }
}
rcu_read_unlock();
}
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 4d53aebe6784..ea7ea80d85e7 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -53,6 +53,7 @@ void bpf_task_storage_free(struct task_struct *task)
if (!local_storage)
goto out;
+ RCU_INIT_POINTER(task->bpf_storage, NULL);
bpf_local_storage_destroy(local_storage);
out:
rcu_read_unlock();
@@ -134,7 +135,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- return bpf_selem_unlink(SELEM(sdata), false);
+ return bpf_selem_unlink(SELEM(sdata));
}
static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 7ec8a74e7ce5..abb0e8713a04 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,20 +40,25 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- return bpf_selem_unlink(SELEM(sdata), false);
+ return bpf_selem_unlink(SELEM(sdata));
}
/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
+ u32 uncharge;
rcu_read_lock_dont_migrate();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
if (!sk_storage)
goto out;
- bpf_local_storage_destroy(sk_storage);
+ RCU_INIT_POINTER(sk->sk_bpf_storage, NULL);
+
+ uncharge = bpf_local_storage_destroy(sk_storage);
+ if (uncharge)
+ atomic_sub(uncharge, &sk->sk_omem_alloc);
out:
rcu_read_unlock_migrate();
}
--
2.47.3
next prev parent reply other threads:[~2026-01-31 5:09 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-01-31 5:09 [PATCH bpf-next v4 00/16] Remove task and cgroup local storage percpu counters Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 01/16] bpf: Select bpf_local_storage_map_bucket based on bpf_local_storage Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:40 ` Amery Hung
2026-01-31 19:13 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 02/16] bpf: Convert bpf_selem_unlink_map to failable Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:40 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 03/16] bpf: Convert bpf_selem_link_map " Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:43 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 04/16] bpf: Convert bpf_selem_unlink " Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 05/16] bpf: Change local_storage->lock and b->lock to rqspinlock Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:46 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 06/16] bpf: Remove task local storage percpu counter Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 07/16] bpf: Remove cgroup " Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 08/16] bpf: Remove unused percpu counter from bpf_local_storage_map_free Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 09/16] bpf: Prepare for bpf_selem_unlink_nofail() Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 10/16] bpf: Support lockless unlink when freeing map or local storage Amery Hung
2026-01-31 5:09 ` Amery Hung [this message]
2026-01-31 5:09 ` [PATCH bpf-next v4 12/16] selftests/bpf: Update sk_storage_omem_uncharge test Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 13/16] selftests/bpf: Update task_local_storage/recursion test Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:49 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 14/16] selftests/bpf: Update task_local_storage/task_storage_nodeadlock test Amery Hung
2026-01-31 5:33 ` bot+bpf-ci
2026-01-31 18:47 ` Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 15/16] selftests/bpf: Remove test_task_storage_map_stress_lookup Amery Hung
2026-01-31 5:09 ` [PATCH bpf-next v4 16/16] selftests/bpf: Choose another percpu variable in bpf for btf_dump test Amery Hung
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260131050920.2574084-12-ameryhung@gmail.com \
--to=ameryhung@gmail.com \
--cc=alexei.starovoitov@gmail.com \
--cc=andrii@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=haoluo@google.com \
--cc=kernel-team@meta.com \
--cc=kpsingh@kernel.org \
--cc=martin.lau@kernel.org \
--cc=memxor@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=song@kernel.org \
--cc=yonghong.song@linux.dev \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox