* [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
@ 2025-09-09 16:32 Tao Chen
2025-09-09 16:32 ` [PATCH bpf-next v2 2/2] selftests/bpf: Add stacktrace map lookup_and_delete_elem test case Tao Chen
2025-09-17 22:16 ` [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Andrii Nakryiko
0 siblings, 2 replies; 8+ messages in thread
From: Tao Chen @ 2025-09-09 16:32 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, eddyz87, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa
Cc: bpf, linux-kernel, Tao Chen
The stacktrace map can easily become full, which leads to failures
when obtaining the stack. Besides increasing the map size, another
option is to let user space delete the stack_id after looking it up,
so extend the existing bpf_map_lookup_and_delete_elem() functionality
to the stacktrace map type.
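From user space, consuming the new functionality looks roughly like
this (a minimal sketch; the fd and stack_id plumbing are assumed, not
part of this patch):

    #include <errno.h>
    #include <linux/perf_event.h>   /* PERF_MAX_STACK_DEPTH */
    #include <bpf/bpf.h>

    /* Copy one stack trace out and free its slot in a single syscall. */
    static int drain_stack(int stackmap_fd, __u32 stack_id)
    {
        __u64 ips[PERF_MAX_STACK_DEPTH] = {};
        int err;

        err = bpf_map_lookup_and_delete_elem(stackmap_fd, &stack_id, ips);
        if (err)
            return err;     /* -ENOENT once the id is gone */
        /* ... symbolize/aggregate ips[] here ... */
        return 0;
    }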
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
include/linux/bpf.h | 2 +-
kernel/bpf/stackmap.c | 16 ++++++++++++++--
kernel/bpf/syscall.c | 8 +++++---
3 files changed, 20 insertions(+), 6 deletions(-)
Change list:
v1 -> v2:
- fix typo s/detele/delete/ (Jiri)
- make sure a following lookup fails with ENOENT after deleting the stack_id (Jiri)
- use '&key' directly as the update value (Jiri)
v1: https://lore.kernel.org/bpf/20250908113622.810652-1-chen.dylane@linux.dev
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8f6e87f0f3a..84a41c42495 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2703,7 +2703,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
u64 flags);
-int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
+int bpf_stackmap_copy_and_delete(struct bpf_map *map, void *key, void *value, bool delete);
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3615c06b7df..bb63a74db7b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall */
-int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ return bpf_stackmap_copy_and_delete(map, key, value, true);
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy_and_delete(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *old_bucket;
@@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
memcpy(value, bucket->data, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
- old_bucket = xchg(&smap->buckets[id], bucket);
+ if (delete)
+ old_bucket = bucket;
+ else
+ old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
@@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_free = stack_map_free,
.map_get_next_key = stack_map_get_next_key,
.map_lookup_elem = stack_map_lookup_elem,
+ .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem,
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fbfa8532c3..d606d156c62 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -318,7 +318,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
- err = bpf_stackmap_copy(map, key, value);
+ err = bpf_stackmap_copy_and_delete(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
err = bpf_fd_array_map_lookup_elem(map, key, value);
} else if (IS_FD_HASH(map)) {
@@ -1627,7 +1627,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
-int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+int __weak bpf_stackmap_copy_and_delete(struct bpf_map *map, void *key, void *value,
+ bool delete)
{
return -ENOTSUPP;
}
@@ -2158,7 +2159,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
if (!bpf_map_is_offloaded(map)) {
bpf_disable_instrumentation();
rcu_read_lock();
--
2.48.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH bpf-next v2 2/2] selftests/bpf: Add stacktrace map lookup_and_delete_elem test case
2025-09-09 16:32 [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Tao Chen
@ 2025-09-09 16:32 ` Tao Chen
2025-09-17 22:16 ` [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Andrii Nakryiko
1 sibling, 0 replies; 8+ messages in thread
From: Tao Chen @ 2025-09-09 16:32 UTC (permalink / raw)
To: ast, daniel, john.fastabend, andrii, martin.lau, eddyz87, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa
Cc: bpf, linux-kernel, Tao Chen
...
test_stacktrace_map:PASS:compare_stack_ips stackmap vs. stack_amap 0 nsec
test_stacktrace_map:PASS:stack_key_map lookup 0 nsec
test_stacktrace_map:PASS:stackmap lookup and delete 0 nsec
test_stacktrace_map:PASS:stackmap lookup deleted stack_id 0 nsec
#397 stacktrace_map:OK
...
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
.../selftests/bpf/prog_tests/stacktrace_map.c | 22 ++++++++++++++++++-
.../selftests/bpf/progs/test_stacktrace_map.c | 8 +++++++
2 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
index 84a7e405e91..7d38afe5cfc 100644
--- a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -3,7 +3,7 @@
void test_stacktrace_map(void)
{
- int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+ int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd, stack_key_map_fd;
const char *prog_name = "oncpu";
int err, prog_fd, stack_trace_len;
const char *file = "./test_stacktrace_map.bpf.o";
@@ -11,6 +11,9 @@ void test_stacktrace_map(void)
struct bpf_program *prog;
struct bpf_object *obj;
struct bpf_link *link;
+ __u32 stack_id;
+ char val_buf[PERF_MAX_STACK_DEPTH *
+ sizeof(struct bpf_stack_build_id)];
err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
@@ -41,6 +44,10 @@ void test_stacktrace_map(void)
if (CHECK_FAIL(stack_amap_fd < 0))
goto disable_pmu;
+ stack_key_map_fd = bpf_find_map(__func__, obj, "stack_key_map");
+ if (CHECK_FAIL(stack_key_map_fd < 0))
+ goto disable_pmu;
+
/* give some time for bpf program run */
sleep(1);
@@ -68,6 +75,19 @@ void test_stacktrace_map(void)
"err %d errno %d\n", err, errno))
goto disable_pmu;
+ err = bpf_map_lookup_elem(stack_key_map_fd, &key, &stack_id);
+ if (CHECK(err, "stack_key_map lookup", "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ err = bpf_map_lookup_and_delete_elem(stackmap_fd, &stack_id, &val_buf);
+ if (CHECK(err, "stackmap lookup and delete",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ err = bpf_map_lookup_elem(stackmap_fd, &stack_id, &val_buf);
+ CHECK((!err || errno != ENOENT), "stackmap lookup deleted stack_id",
+ "err %d errno %d\n", err, errno);
+
disable_pmu:
bpf_link__destroy(link);
close_prog:
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
index 47568007b66..3bede76c151 100644
--- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
@@ -38,6 +38,13 @@ struct {
__type(value, stack_trace_t);
} stack_amap SEC(".maps");
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} stack_key_map SEC(".maps");
+
/* taken from /sys/kernel/tracing/events/sched/sched_switch/format */
struct sched_switch_args {
unsigned long long pad;
@@ -64,6 +71,7 @@ int oncpu(struct sched_switch_args *ctx)
/* The size of stackmap and stackid_hmap should be the same */
key = bpf_get_stackid(ctx, &stackmap, 0);
if ((int)key >= 0) {
+ bpf_map_update_elem(&stack_key_map, &val, &key, 0);
bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
stack_p = bpf_map_lookup_elem(&stack_amap, &key);
if (stack_p)
--
2.48.1
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-09 16:32 [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Tao Chen
2025-09-09 16:32 ` [PATCH bpf-next v2 2/2] selftests/bpf: Add stacktrace map lookup_and_delete_elem test case Tao Chen
@ 2025-09-17 22:16 ` Andrii Nakryiko
2025-09-18 1:35 ` Alexei Starovoitov
2025-09-18 12:45 ` Tao Chen
1 sibling, 2 replies; 8+ messages in thread
From: Andrii Nakryiko @ 2025-09-17 22:16 UTC (permalink / raw)
To: Tao Chen
Cc: ast, daniel, john.fastabend, andrii, martin.lau, eddyz87, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, bpf, linux-kernel
On Tue, Sep 9, 2025 at 9:32 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> The stacktrace map can easily become full, which leads to failures
> when obtaining the stack. Besides increasing the map size, another
> option is to let user space delete the stack_id after looking it up,
This is not a "solution", really. Just another partially broken
workaround for an already broken STACKMAP (and there is no fixing
it, IMO).
When a user is doing lookup_and_delete for that element, another BPF
program can reuse that slot, and user space will delete the stack that
is, effectively, still in use.
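One possible interleaving, roughly:

    user space                          BPF program
    ----------                          -----------
    reads stack_id X from a ringbuf
                                        bpf_get_stackid() reuses slot X
                                        for a new trace
    lookup_and_delete_elem(&X)
      frees the *new* bucket, whose
      producer still believes X is live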
<rant>
I also just looked at the bpf_stackmap_copy() implementation, and even
lookup behavior appears broken. We temporarily remove the bucket while
copying, just to put it back a bit later. Meanwhile another BPF
program can put something into that bucket and we'll just silently
discard that new value later. That was a new one for me, but whatever,
as I said STACKMAP cannot be used reliably anyways.
</rant>
But let's stay constructive here. Some vague proposals below.
Really, STACKMAP should have used some form of refcounting and let
users put those refs, instead of just unconditionally removing the
element. I wonder if we can retrofit this and treat lookup/delete as
get/put instead? This would work well for a typical use pattern where
we send stack_id through ringbuf of some sort and user space fetches
stack trace by that ID. Each bpf_get_stackid() would be treated as
refcount bump, and each lookup_and_delete or just delete would be
treated as refcount put.
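Something like this, very roughly (a sketch; the refcount field and
its semantics are made up, not existing code):

    struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        refcount_t uses;    /* hypothetical: one ref per bpf_get_stackid() */
        u32 nr;
        u64 data[];
    };

    /* bpf_get_stackid() hitting an existing bucket:
     *     refcount_inc(&bucket->uses);
     * delete / lookup_and_delete as the "put":
     *     if (refcount_dec_and_test(&bucket->uses))
     *         pcpu_freelist_push(&smap->freelist, &bucket->fnode);
     */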
Also, it would be better for STACKMAP to be a proper hashmap and
handle collisions properly.
The above two changes would probably make STACKMAP actually usable as
"a stack trace bank" producing 4-byte IDs that are easily added to
fixed-sized ringbuf samples as an extra field. This model is sometimes
way more convenient than calling bpf_get_stack() and copying the stack
into the ringbuf (which is currently the only way to get reliable stack
traces with BPF, IMO).
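For reference, the BPF side of that pattern, as a minimal sketch (map
names and sizes are made up, and the refcount semantics are the
proposal above, not current behavior):

    #include <linux/bpf.h>
    #include <linux/bpf_perf_event.h>
    #include <linux/perf_event.h>
    #include <bpf/bpf_helpers.h>

    typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];

    struct {
        __uint(type, BPF_MAP_TYPE_STACK_TRACE);
        __uint(max_entries, 16384);
        __type(key, __u32);
        __type(value, stack_trace_t);
    } stackmap SEC(".maps");

    struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 256 * 1024);
    } events SEC(".maps");

    struct event {
        __u32 pid;
        __s32 stack_id;     /* 4-byte ID instead of a full stack copy */
    };

    SEC("perf_event")
    int on_sample(struct bpf_perf_event_data *ctx)
    {
        struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);

        if (!e)
            return 0;
        e->pid = bpf_get_current_pid_tgid() >> 32;
        /* under the proposed semantics this would also take a ref */
        e->stack_id = bpf_get_stackid(ctx, &stackmap, 0);
        bpf_ringbuf_submit(e, 0);
        return 0;
    }

    char _license[] SEC("license") = "GPL";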
So, tl;dr: instead of pretending we are fixing something about
STACKMAP with slightly-more-atomic (but not really) lookup_and_delete
support, maybe let's try to actually make STACKMAP usable? (It's way
harder than this patch, but has more value, IMO.)
What does everyone think?
P.S. It seems like a good idea to switch STACKMAP to open addressing
instead of the current kind-of-bucket-chain-but-not-really
implementation. It's fixed size and pre-allocated already, so open
addressing seems like a great approach here, IMO.
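A rough sketch of the probe loop (MAX_PROBES and stacks_equal() are
made up):

    u32 i, idx = hash & (smap->n_buckets - 1);

    for (i = 0; i < MAX_PROBES; i++) {
        struct stack_map_bucket *b = READ_ONCE(smap->buckets[idx]);

        if (!b)
            break;      /* free slot: claim it for the new trace */
        if (b->hash == hash && stacks_equal(b, ips, nr))
            break;      /* same trace: reuse this id */
        idx = (idx + 1) & (smap->n_buckets - 1);
    }
    /* idx itself becomes the stack_id, so two different traces can no
     * longer silently alias one id
     */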
> so extend the existing bpf_map_lookup_and_delete_elem() functionality
> to the stacktrace map type.
>
> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
> ---
> include/linux/bpf.h | 2 +-
> kernel/bpf/stackmap.c | 16 ++++++++++++++--
> kernel/bpf/syscall.c | 8 +++++---
> 3 files changed, 20 insertions(+), 6 deletions(-)
>
As for the patch in question, I think the logic is correct :) I find
the bpf_stackmap_copy_and_delete() name bad and misleading, though,
because it's more of a "maybe also delete". Maybe
bpf_stackmap_extract()? Don't know, it's a minor nit anyway.
[...]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-17 22:16 ` [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Andrii Nakryiko
@ 2025-09-18 1:35 ` Alexei Starovoitov
2025-09-18 13:34 ` Tao Chen
2025-09-18 12:45 ` Tao Chen
1 sibling, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2025-09-18 1:35 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Tao Chen, Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
bpf, LKML
On Wed, Sep 17, 2025 at 3:16 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
>
> P.S. It seems like a good idea to switch STACKMAP to open addressing
> instead of the current kind-of-bucket-chain-but-not-really
> implementation. It's fixed size and pre-allocated already, so open
> addressing seems like a great approach here, IMO.
That makes sense. It won't have backward compat issues.
Just a more reliable stack_id.
Fixed value_size is another footgun there.
Especially for collecting user stack traces.
We can switch the whole stackmap to bpf_mem_alloc()
or wait for kmalloc_nolock().
But it's probably a diminishing return.
bpf_get_stack() also isn't great: one copy into perf_callchain_entry,
then a 2nd copy into an on-stack/percpu buffer or the ringbuf,
and a 3rd copy of the correct size into the ringbuf (optional).
Also, I just realized we have another nasty race there.
In the past bpf progs were run in preempt disabled context,
but we forgot to adjust bpf_get_stack[id]() helpers when everything
switched to migrate disable.
The return value from get_perf_callchain() may be reused
if another task preempts and requests the stack.
We have a partially incorrect comment in __bpf_get_stack() too:
if (may_fault)
rcu_read_lock(); /* need RCU for perf's callchain below */
RCU can be preemptible, so rcu_read_lock() keeps
trace = get_perf_callchain(...)
accessible, but the per-cpu trace buffer can still be overwritten.
It's not an issue for CONFIG_PREEMPT_NONE=y, but that doesn't
give much comfort.
A modern-day BPF API would probably be:
- get_callchain_entry()/put() kfuncs to expose the low-level mechanism
with safe acq/rel of the temp buffer.
- then more kfuncs to run perf_callchain_kernel/user into that buffer.
and with bpf_mem_alloc and hash kfuncs the bpf prog could
implement either a bpf_get_stack() equivalent or a much better
bpf_get_stackid() with variable-length stack traces and so on.
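Sketching the first two pieces (all names and signatures below are
hypothetical):

    /* acquire/release a per-cpu callchain buffer, safe w.r.t. preemption */
    struct perf_callchain_entry *bpf_callchain_entry_get(void);
    void bpf_callchain_entry_put(struct perf_callchain_entry *entry);

    /* fill the acquired buffer */
    int bpf_perf_callchain_kernel(struct perf_callchain_entry *entry,
                                  struct pt_regs *regs);
    int bpf_perf_callchain_user(struct perf_callchain_entry *entry,
                                struct pt_regs *regs);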
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-17 22:16 ` [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Andrii Nakryiko
2025-09-18 1:35 ` Alexei Starovoitov
@ 2025-09-18 12:45 ` Tao Chen
1 sibling, 0 replies; 8+ messages in thread
From: Tao Chen @ 2025-09-18 12:45 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: ast, daniel, john.fastabend, andrii, martin.lau, eddyz87, song,
yonghong.song, kpsingh, sdf, haoluo, jolsa, bpf, linux-kernel
On 2025/9/18 06:16, Andrii Nakryiko wrote:
> On Tue, Sep 9, 2025 at 9:32 AM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> The stacktrace map can easily become full, which leads to failures
>> when obtaining the stack. Besides increasing the map size, another
>> option is to let user space delete the stack_id after looking it up,
>
> This is not a "solution", really. Just another partially broken
> workaround to an already broken STACKMAP map (and there is no fixing
> it, IMO).
>
Actually it is, at least for us. In our continuous-profiling use case
with perf_event, the results look better when we look up the stack_id
and then delete it to relieve the size pressure on the map, and we have
no strict requirement on the accuracy of the stack information.
> When a user is doing lookup_and_delete for that element, another BPF
> program can reuse that slot, and user space will delete the stack that
> is, effectively, still in use.
>
> <rant>
>
> I also just looked at the bpf_stackmap_copy() implementation, and even
> lookup behavior appears broken. We temporarily remove the bucket while
> copying, just to put it back a bit later. Meanwhile another BPF
> program can put something into that bucket and we'll just silently
> discard that new value later. That was a new one for me, but whatever,
Yes, it is a problem.
> as I said STACKMAP cannot be used reliably anyways.
>
> </rant>
>
> But let's stay constructive here. Some vague proposals below.
>
> Really, STACKMAP should have used some form of refcounting and let
> users put those refs, instead of just unconditionally removing the
> element. I wonder if we can retrofit this and treat lookup/delete as
> get/put instead? This would work well for a typical use pattern where
> we send stack_id through ringbuf of some sort and user space fetches
> stack trace by that ID. Each bpf_get_stackid() would be treated as
> refcount bump, and each lookup_and_delete or just delete would be
> treated as refcount put.
>
> Also, it would be better for STACKMAP to be a proper hashmap and
> handle collisions properly.
>
> The above two changes would probably make STACKMAP actually usable as
> "a stack trace bank" producing 4-byte IDs that are easily added to
> fixed-sized ringbuf samples as an extra field. This model sometimes is
> way more convenient than getting bpf_get_stack() and copying it into
> ringbuf (which is currently the only way to have reliable stack traces
> with BPF, IMO).
>
> So, tl;dr. Maybe instead of pretending like we are fixing something
> about STACKMAP with slightly-more-atomic (but not really)
> lookup_and_delete support, maybe let's try to actually make STACKMAP
> usable?.. (it's way harder than this patch, but has more value, IMO)
>
The idea sounds great. I will try to make improvements in this
direction, though it will be quite challenging for me right now.
> What does everyone think?
>
> P.S. It seems like a good idea to switch STACKMAP to open addressing
> instead of the current kind-of-bucket-chain-but-not-really
> implementation. It's fixed size and pre-allocated already, so open
> addressing seems like a great approach here, IMO.
>
>> so extend the existing bpf_map_lookup_and_delete_elem() functionality
>> to the stacktrace map type.
>>
>> Signed-off-by: Tao Chen <chen.dylane@linux.dev>
>> ---
>> include/linux/bpf.h | 2 +-
>> kernel/bpf/stackmap.c | 16 ++++++++++++++--
>> kernel/bpf/syscall.c | 8 +++++---
>> 3 files changed, 20 insertions(+), 6 deletions(-)
>>
>
> As for the patch in question, I think the logic is correct :) I find
> bpf_stackmap_copy_and_delete() name bad and misleading, though,
> because it's more of "maybe also delete". Maybe
> bpf_stackmap_extract()? Don't know, it's minor nit anyways.
>
> [...]

Will rename it in v3. The original idea of this patch was just to
make it convenient for users to delete the stack_id when they obtain it.
--
Best Regards
Tao Chen
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-18 1:35 ` Alexei Starovoitov
@ 2025-09-18 13:34 ` Tao Chen
2025-09-19 2:01 ` Alexei Starovoitov
0 siblings, 1 reply; 8+ messages in thread
From: Tao Chen @ 2025-09-18 13:34 UTC (permalink / raw)
To: Alexei Starovoitov, Andrii Nakryiko
Cc: Alexei Starovoitov, Daniel Borkmann, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Eduard, Song Liu,
Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
bpf, LKML
On 2025/9/18 09:35, Alexei Starovoitov wrote:
> On Wed, Sep 17, 2025 at 3:16 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
>>
>>
>> P.S. It seems like a good idea to switch STACKMAP to open addressing
>> instead of the current kind-of-bucket-chain-but-not-really
>> implementation. It's fixed size and pre-allocated already, so open
>> addressing seems like a great approach here, IMO.
>
> That makes sense. It won't have backward compat issues.
> Just more reliable stack_id.
>
> Fixed value_size is another footgun there.
> Especially for collecting user stack traces.
> We can switch the whole stackmap to bpf_mem_alloc()
> or wait for kmalloc_nolock().
> But it's probably a diminishing return.
>
> bpf_get_stack() also isn't great with a copy into
> perf_callchain_entry, then 2nd copy into on stack/percpu buf/ringbuf,
> and 3rd copy of correct size into ringbuf (optional).
>
> Also, I just realized we have another nasty race there.
> In the past bpf progs were run in preempt disabled context,
> but we forgot to adjust bpf_get_stack[id]() helpers when everything
> switched to migrate disable.
>
> The return value from get_perf_callchain() may be reused
> if another task preempts and requests the stack.
> We have partially incorrect comment in __bpf_get_stack() too:
> if (may_fault)
> rcu_read_lock(); /* need RCU for perf's callchain below */
>
> rcu can be preemptable. so rcu_read_lock() makes
> trace = get_perf_callchain(...)
> accessible, but that per-cpu trace buffer can be overwritten.
> It's not an issue for CONFIG_PREEMPT_NONE=y, but that doesn't
> give much comfort.
Hi Alexei,
Can we fix it like this?
- if (may_fault)
- rcu_read_lock(); /* need RCU for perf's callchain below */
+ preempt_disable();
if (trace_in)
trace = trace_in;
@@ -455,8 +454,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {
- if (may_fault)
- rcu_read_unlock();
+ preempt_enable();
goto err_fault;
}
@@ -475,9 +473,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
memcpy(buf, ips, copy_len);
}
- /* trace/ips should not be dereferenced after this point */
- if (may_fault)
- rcu_read_unlock();
+ preempt_enable();
>
> Modern day bpf api would probably be
> - get_callchain_entry()/put() kfuncs to expose low level mechanism
> with safe acq/rel of temp buffer.
> - then another kfuncs to perf_callchain_kernel/user into that buffer.
>
> and with bpf_mem_alloc and hash kfuncs the bpf prog can
> implement either bpf_get_stack() equivalent or much better
> bpf_get_stackid() with variable length stack traces and so on.
--
Best Regards
Tao Chen
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-18 13:34 ` Tao Chen
@ 2025-09-19 2:01 ` Alexei Starovoitov
2025-09-19 2:08 ` Tao Chen
0 siblings, 1 reply; 8+ messages in thread
From: Alexei Starovoitov @ 2025-09-19 2:01 UTC (permalink / raw)
To: Tao Chen
Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Eduard,
Song Liu, Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Jiri Olsa, bpf, LKML
On Thu, Sep 18, 2025 at 6:35 AM Tao Chen <chen.dylane@linux.dev> wrote:
>
> On 2025/9/18 09:35, Alexei Starovoitov wrote:
> > On Wed, Sep 17, 2025 at 3:16 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> >>
> >>
> >> P.S. It seems like a good idea to switch STACKMAP to open addressing
> >> instead of the current kind-of-bucket-chain-but-not-really
> >> implementation. It's fixed size and pre-allocated already, so open
> >> addressing seems like a great approach here, IMO.
> >
> > That makes sense. It won't have backward compat issues.
> > Just more reliable stack_id.
> >
> > Fixed value_size is another footgun there.
> > Especially for collecting user stack traces.
> > We can switch the whole stackmap to bpf_mem_alloc()
> > or wait for kmalloc_nolock().
> > But it's probably a diminishing return.
> >
> > bpf_get_stack() also isn't great with a copy into
> > perf_callchain_entry, then 2nd copy into on stack/percpu buf/ringbuf,
> > and 3rd copy of correct size into ringbuf (optional).
> >
> > Also, I just realized we have another nasty race there.
> > In the past bpf progs were run in preempt disabled context,
> > but we forgot to adjust bpf_get_stack[id]() helpers when everything
> > switched to migrate disable.
> >
> > The return value from get_perf_callchain() may be reused
> > if another task preempts and requests the stack.
> > We have partially incorrect comment in __bpf_get_stack() too:
> > if (may_fault)
> > rcu_read_lock(); /* need RCU for perf's callchain below */
> >
> > rcu can be preemptable. so rcu_read_lock() makes
> > trace = get_perf_callchain(...)
> > accessible, but that per-cpu trace buffer can be overwritten.
> > It's not an issue for CONFIG_PREEMPT_NONE=y, but that doesn't
> > give much comfort.
>
> Hi Alexei,
>
> Can we fix it like this?
>
> - if (may_fault)
> - rcu_read_lock(); /* need RCU for perf's callchain below */
> + preempt_disable();
>
> if (trace_in)
> trace = trace_in;
> @@ -455,8 +454,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> crosstask, false);
>
> if (unlikely(!trace) || trace->nr < skip) {
> - if (may_fault)
> - rcu_read_unlock();
> + preempt_enable();
> goto err_fault;
> }
>
> @@ -475,9 +473,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
> memcpy(buf, ips, copy_len);
> }
>
> - /* trace/ips should not be dereferenced after this point */
> - if (may_fault)
> - rcu_read_unlock();
> + preempt_enable();
That should do it. Don't see an issue at first glance.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE
2025-09-19 2:01 ` Alexei Starovoitov
@ 2025-09-19 2:08 ` Tao Chen
0 siblings, 0 replies; 8+ messages in thread
From: Tao Chen @ 2025-09-19 2:08 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Andrii Nakryiko, Alexei Starovoitov, Daniel Borkmann,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Eduard,
Song Liu, Yonghong Song, KP Singh, Stanislav Fomichev, Hao Luo,
Jiri Olsa, bpf, LKML
On 2025/9/19 10:01, Alexei Starovoitov wrote:
> On Thu, Sep 18, 2025 at 6:35 AM Tao Chen <chen.dylane@linux.dev> wrote:
>>
>> On 2025/9/18 09:35, Alexei Starovoitov wrote:
>>> On Wed, Sep 17, 2025 at 3:16 PM Andrii Nakryiko
>>> <andrii.nakryiko@gmail.com> wrote:
>>>>
>>>>
>>>> P.S. It seems like a good idea to switch STACKMAP to open addressing
>>>> instead of the current kind-of-bucket-chain-but-not-really
>>>> implementation. It's fixed size and pre-allocated already, so open
>>>> addressing seems like a great approach here, IMO.
>>>
>>> That makes sense. It won't have backward compat issues.
>>> Just more reliable stack_id.
>>>
>>> Fixed value_size is another footgun there.
>>> Especially for collecting user stack traces.
>>> We can switch the whole stackmap to bpf_mem_alloc()
>>> or wait for kmalloc_nolock().
>>> But it's probably a diminishing return.
>>>
>>> bpf_get_stack() also isn't great with a copy into
>>> perf_callchain_entry, then 2nd copy into on stack/percpu buf/ringbuf,
>>> and 3rd copy of correct size into ringbuf (optional).
>>>
>>> Also, I just realized we have another nasty race there.
>>> In the past bpf progs were run in preempt disabled context,
>>> but we forgot to adjust bpf_get_stack[id]() helpers when everything
>>> switched to migrate disable.
>>>
>>> The return value from get_perf_callchain() may be reused
>>> if another task preempts and requests the stack.
>>> We have partially incorrect comment in __bpf_get_stack() too:
>>> if (may_fault)
>>> rcu_read_lock(); /* need RCU for perf's callchain below */
>>>
>>> rcu can be preemptable. so rcu_read_lock() makes
>>> trace = get_perf_callchain(...)
>>> accessible, but that per-cpu trace buffer can be overwritten.
>>> It's not an issue for CONFIG_PREEMPT_NONE=y, but that doesn't
>>> give much comfort.
>>
>> Hi Alexei,
>>
>> Can we fix it like this?
>>
>> - if (may_fault)
>> - rcu_read_lock(); /* need RCU for perf's callchain below */
>> + preempt_disable();
>>
>> if (trace_in)
>> trace = trace_in;
>> @@ -455,8 +454,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>> crosstask, false);
>>
>> if (unlikely(!trace) || trace->nr < skip) {
>> - if (may_fault)
>> - rcu_read_unlock();
>> + preempt_enable();
>> goto err_fault;
>> }
>>
>> @@ -475,9 +473,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
>> memcpy(buf, ips, copy_len);
>> }
>>
>> - /* trace/ips should not be dereferenced after this point */
>> - if (may_fault)
>> - rcu_read_unlock();
>> + preempt_enable();
>
> That should do it. Don't see an issue at first glance.
Ok, I will send a patch later, thanks.
--
Best Regards
Tao Chen
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-09-19 2:08 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-09 16:32 [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Tao Chen
2025-09-09 16:32 ` [PATCH bpf-next v2 2/2] selftests/bpf: Add stacktrace map lookup_and_delete_elem test case Tao Chen
2025-09-17 22:16 ` [PATCH bpf-next v2 1/2] bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE Andrii Nakryiko
2025-09-18 1:35 ` Alexei Starovoitov
2025-09-18 13:34 ` Tao Chen
2025-09-19 2:01 ` Alexei Starovoitov
2025-09-19 2:08 ` Tao Chen
2025-09-18 12:45 ` Tao Chen