* [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack
@ 2026-02-06 9:06 Tao Chen
2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Tao Chen @ 2026-02-06 9:06 UTC (permalink / raw)
To: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo
Cc: bpf, linux-kernel, Tao Chen
The get_perf_callchain() return value may be reused if a task is preempted
after the BPF program enters migrate disable mode, so we should add
preempt_disable. And as Andrii suggested, BPF can guarantee that the perf
callchain buffer won't be released during use: for bpf_get_stackid, the BPF
stack map will keep it alive by delaying put_callchain_buffers() until freeing
time, and for bpf_get_stack/bpf_get_task_stack, the BPF program itself will
hold the buffer alive until freeing time, which is delayed until after the
RCU Tasks Trace + RCU grace period.
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
Change list:
- v1 -> v2
- add preempt_disable for bpf_get_stack in patch1
- add patch2
- v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev
kernel/bpf/stackmap.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c1..1b100a03ef2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
- if (may_fault)
- rcu_read_lock(); /* need RCU for perf's callchain below */
+ if (!trace_in)
+ preempt_disable();
if (trace_in) {
trace = trace_in;
@@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
}
if (unlikely(!trace) || trace->nr < skip) {
- if (may_fault)
- rcu_read_unlock();
+ if (!trace_in)
+ preempt_enable();
goto err_fault;
}
@@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
memcpy(buf, ips, copy_len);
}
- /* trace/ips should not be dereferenced after this point */
- if (may_fault)
- rcu_read_unlock();
+ if (!trace_in)
+ preempt_enable();
if (user_build_id)
stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
--
2.48.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen @ 2026-02-06 9:06 ` Tao Chen 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2 siblings, 2 replies; 9+ messages in thread From: Tao Chen @ 2026-02-06 9:06 UTC (permalink / raw) To: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, Tao Chen The get_perf_callchain() return values may be reused if a task is preempted after the BPF program enters migrate disable mode, so we should add preempt_disable. The get build-id offset in __bpf_get_stackid may increase the length of the preempt disabled section. Luckily, it is safe to enable preempt after perf callchain ips copied to BPF map bucket memory, so we can enable preempt before stack_map_get_build_id_offset. 
Signed-off-by: Tao Chen <chen.dylane@linux.dev> --- kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1b100a03ef2..d263f851f08 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -246,33 +246,50 @@ get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) #endif } -static long __bpf_get_stackid(struct bpf_map *map, - struct perf_callchain_entry *trace, u64 flags) +static long __bpf_get_stackid(struct bpf_map *map, struct pt_regs *regs, + struct perf_callchain_entry *trace_in, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + long ret; u64 *ips; bool hash_matches; + struct perf_callchain_entry *trace; + + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + if (trace_in) { + trace = trace_in; + } else { + preempt_disable(); + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false, 0); + if (unlikely(!trace)) { + ret = -EFAULT; + goto go_out; + } + } - if (trace->nr <= skip) + if (trace->nr <= skip) { /* skipping more than usable stack trace */ - return -EFAULT; + ret = -EFAULT; + goto go_out; + } - max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); + ret = id; bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) - return id; + goto go_out; if 
(stack_map_use_build_id(map)) { struct bpf_stack_build_id *id_offs; @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; + if (unlikely(!new_bucket)) { + ret = -ENOMEM; + goto go_out; + } new_bucket->nr = trace_nr; id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; + + /* + * It is safe after perf callchain ips copied to bucket buffer + * to reduce the length of preempt section, we can enable preempt here. + */ + if (!trace_in) + preempt_enable(); + stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && @@ -300,14 +327,19 @@ static long __bpf_get_stackid(struct bpf_map *map, } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) - return id; - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; + goto go_out; + + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + ret = -EEXIST; + goto go_out; + } new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; + if (unlikely(!new_bucket)) { + ret = -ENOMEM; + goto go_out; + } memcpy(new_bucket->data, ips, trace_len); } @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); - return id; + +go_out: + if (!trace_in) + preempt_enable(); + return ret; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 elem_size = stack_map_data_size(map); - bool user = flags & BPF_F_USER_STACK; - struct perf_callchain_entry *trace; - bool kernel = !user; - 
u32 max_depth; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); - trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false, 0); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - - return __bpf_get_stackid(map, trace, flags); + return __bpf_get_stackid(map, regs, NULL, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { @@ -395,7 +417,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, if (kernel) { trace->nr = nr_kernel; - ret = __bpf_get_stackid(map, trace, flags); + ret = __bpf_get_stackid(map, NULL, trace, flags); } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -404,7 +426,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - ret = __bpf_get_stackid(map, trace, flags); + ret = __bpf_get_stackid(map, NULL, trace, flags); } /* restore nr */ -- 2.48.1 ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen @ 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 9:58 ` Tao Chen 2026-02-06 17:20 ` Andrii Nakryiko 1 sibling, 1 reply; 9+ messages in thread From: bot+bpf-ci @ 2026-02-06 9:34 UTC (permalink / raw) To: chen.dylane, song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, chen.dylane, ast, andrii, daniel, martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai [-- Attachment #1: Type: text/plain, Size: 3011 bytes --] > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c > index 1b100a03ef2e..d263f851f084 100644 > --- a/kernel/bpf/stackmap.c > +++ b/kernel/bpf/stackmap.c [ ... ] > @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, > /* for build_id+offset, pop a bucket before slow cmp */ > new_bucket = (struct stack_map_bucket *) > pcpu_freelist_pop(&smap->freelist); > - if (unlikely(!new_bucket)) > - return -ENOMEM; > + if (unlikely(!new_bucket)) { > + ret = -ENOMEM; > + goto go_out; > + } > new_bucket->nr = trace_nr; > id_offs = (struct bpf_stack_build_id *)new_bucket->data; > for (i = 0; i < trace_nr; i++) > id_offs[i].ip = ips[i]; > + > + /* > + * It is safe after perf callchain ips copied to bucket buffer > + * to reduce the length of preempt section, we can enable preempt here. > + */ > + if (!trace_in) > + preempt_enable(); > + When the stack_map_use_build_id() path falls through to insert a new entry (neither the hash_matches early return nor the -EEXIST early return is taken), this preempt_enable() is called here, and then the code continues to go_out where preempt_enable() is called again. The sequence when trace_in == NULL: preempt_disable() // entering __bpf_get_stackid ... if (!trace_in) preempt_enable(); // here, after copying IPs ... 
// fall through to insert new entry new_bucket->hash = hash; ... xchg(&smap->buckets[id], new_bucket); ... go_out: if (!trace_in) preempt_enable(); // second enable - imbalanced The two direct returns (return id and return -EEXIST) after the early preempt_enable() correctly bypass go_out. However, when inserting a new stack trace (the normal case when no exact match exists and the bucket can be reused), the code falls through to go_out and calls preempt_enable() a second time. Could this cause preempt_count underflow when using BPF_F_STACK_BUILD_ID maps with bpf_get_stackid()? > @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, > old_bucket = xchg(&smap->buckets[id], new_bucket); > if (old_bucket) > pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); > - return id; > + > +go_out: > + if (!trace_in) > + preempt_enable(); > + return ret; > } --- AI reviewed your patch. Please fix the bug or email reply why it's not a bug. See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21745231308 AI-authorship-score: low AI-authorship-explanation: The commit shows typical human patterns including a control-flow bug from manual refactoring that AI would likely catch, combined with natural but slightly awkward phrasing in comments. issues-found: 1 issue-severity-score: high issue-severity-explanation: Preempt count underflow in the build_id path causes kernel warnings and potential system instability when inserting new stack traces. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:34 ` bot+bpf-ci @ 2026-02-06 9:58 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-06 9:58 UTC (permalink / raw) To: bot+bpf-ci, song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, martin.lau, clm, ihor.solodrai 在 2026/2/6 17:34, bot+bpf-ci@kernel.org 写道: >> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c >> index 1b100a03ef2e..d263f851f084 100644 >> --- a/kernel/bpf/stackmap.c >> +++ b/kernel/bpf/stackmap.c > > [ ... ] > >> @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, >> /* for build_id+offset, pop a bucket before slow cmp */ >> new_bucket = (struct stack_map_bucket *) >> pcpu_freelist_pop(&smap->freelist); >> - if (unlikely(!new_bucket)) >> - return -ENOMEM; >> + if (unlikely(!new_bucket)) { >> + ret = -ENOMEM; >> + goto go_out; >> + } >> new_bucket->nr = trace_nr; >> id_offs = (struct bpf_stack_build_id *)new_bucket->data; >> for (i = 0; i < trace_nr; i++) >> id_offs[i].ip = ips[i]; >> + >> + /* >> + * It is safe after perf callchain ips copied to bucket buffer >> + * to reduce the length of preempt section, we can enable preempt here. >> + */ >> + if (!trace_in) >> + preempt_enable(); >> + > > When the stack_map_use_build_id() path falls through to insert a new entry > (neither the hash_matches early return nor the -EEXIST early return is > taken), this preempt_enable() is called here, and then the code continues > to go_out where preempt_enable() is called again. > > The sequence when trace_in == NULL: > > preempt_disable() // entering __bpf_get_stackid > ... > if (!trace_in) > preempt_enable(); // here, after copying IPs > ... > // fall through to insert new entry > new_bucket->hash = hash; > ... > xchg(&smap->buckets[id], new_bucket); > ... 
> go_out: > if (!trace_in) > preempt_enable(); // second enable - imbalanced > > The two direct returns (return id and return -EEXIST) after the early > preempt_enable() correctly bypass go_out. However, when inserting a new > stack trace (the normal case when no exact match exists and the bucket > can be reused), the code falls through to go_out and calls preempt_enable() > a second time. > will fix it in v3, and wait for other comments from matainers. > Could this cause preempt_count underflow when using BPF_F_STACK_BUILD_ID > maps with bpf_get_stackid()? > >> @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, >> old_bucket = xchg(&smap->buckets[id], new_bucket); >> if (old_bucket) >> pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); >> - return id; >> + >> +go_out: >> + if (!trace_in) >> + preempt_enable(); >> + return ret; >> } > > > --- > AI reviewed your patch. Please fix the bug or email reply why it's not a bug. > See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md > > CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21745231308 > > AI-authorship-score: low > AI-authorship-explanation: The commit shows typical human patterns including a control-flow bug from manual refactoring that AI would likely catch, combined with natural but slightly awkward phrasing in comments. > issues-found: 1 > issue-severity-score: high > issue-severity-explanation: Preempt count underflow in the build_id path causes kernel warnings and potential system instability when inserting new stack traces. -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 9:34 ` bot+bpf-ci @ 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-11 7:18 ` Tao Chen 1 sibling, 1 reply; 9+ messages in thread From: Andrii Nakryiko @ 2026-02-06 17:20 UTC (permalink / raw) To: Tao Chen Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: > > The get_perf_callchain() return values may be reused if a task is preempted > after the BPF program enters migrate disable mode, so we should add > preempt_disable. > The get build-id offset in __bpf_get_stackid may increase the length > of the preempt disabled section. Luckily, it is safe to enable preempt > after perf callchain ips copied to BPF map bucket memory, so we can enable > preempt before stack_map_get_build_id_offset. > > Signed-off-by: Tao Chen <chen.dylane@linux.dev> > --- > kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- > 1 file changed, 53 insertions(+), 31 deletions(-) > Let's take a step back and look at the bigger picture. This whole trace_in thing is still problematic because one way or another we do get perf_callchain_entry, which *needs* preemption disable. So we need to refactor __bpf_get_stackid() in such a way that we can do all these steps separately: a) get stackmap entry (pcpu_freelist_pop parts) b) (under disabled preemption) get temporary per-cpu callchain_entry and copy its IPs into stackmap entry c) (now with preemption enabled) perform build ID fetch and transformation Current logic is too coupled together to allow this, but conceptually there is nothing preventing us from breaking __bpf_get_stackid() (and see my replies in previous email, we should do similar breakup for __bpf_get_stack) into few separate steps. 
We have to do this to make all of this work in all the combinations of sleepable/non-sleepable and with/without build id, IMO. Take your time, don't rush, think this through. This is a bit of a maze of code, we should untangle it properly, not hack our way through it. [...] ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 17:20 ` Andrii Nakryiko @ 2026-02-11 7:18 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-11 7:18 UTC (permalink / raw) To: Andrii Nakryiko Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel 在 2026/2/7 01:20, Andrii Nakryiko 写道: > On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: >> >> The get_perf_callchain() return values may be reused if a task is preempted >> after the BPF program enters migrate disable mode, so we should add >> preempt_disable. >> The get build-id offset in __bpf_get_stackid may increase the length >> of the preempt disabled section. Luckily, it is safe to enable preempt >> after perf callchain ips copied to BPF map bucket memory, so we can enable >> preempt before stack_map_get_build_id_offset. >> >> Signed-off-by: Tao Chen <chen.dylane@linux.dev> >> --- >> kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- >> 1 file changed, 53 insertions(+), 31 deletions(-) >> > > Let's take a step back and look at the bigger picture. This whole > trace_in thing is still problematic because one way or another we do > get perf_callchain_entry, which *needs* preemption disable. So we need > to refactor __bpf_get_stackid() in such a way that we can do all these > steps separately: > > a) get stackmap entry (pcpu_freelist_pop parts) > b) (under disabled preemption) get temporary per-cpu callchain_entry > and copy its IPs into stackmap entry > c) (now with preemption enabled) perform build ID fetch and transformation > Ok, I've tried to follow this approach to make the code clearer. Thanks. 
> Current logic is too coupled together to allow this, but conceptually > there is nothing preventing us from breaking __bpf_get_stackid() (and > see my replies in previous email, we should do similar breakup for > __bpf_get_stack) into few separate steps. > > We have to do this to make all of this work in all the combinations of > sleepable/non-sleepable and with/without build id, IMO. > > Take your time, don't rush, think this through. This is a bit of a > maze of code, we should untangle it properly, not hack our way through > it. > > [...] -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
* [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen @ 2026-02-06 14:19 ` syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2 siblings, 0 replies; 9+ messages in thread From: syzbot ci @ 2026-02-06 14:19 UTC (permalink / raw) To: andrii, ast, bpf, chen.dylane, daniel, eddyz87, haoluo, john.fastabend, jolsa, kpsingh, linux-kernel, martin.lau, sdf, song, yonghong.song Cc: syzbot, syzkaller-bugs syzbot ci has tested the following series [v2] bpf: Add preempt disable for bpf_get_stack https://lore.kernel.org/all/20260206090653.1336687-1-chen.dylane@linux.dev * [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack * [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid and found the following issue: WARNING in preempt_count_sub Full report is available here: https://ci.syzbot.org/series/90d08df2-d19d-404f-b9dd-c201605b83a2 *** WARNING in preempt_count_sub tree: bpf-next URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git base: 1ace9bac1ad2bc6a0a70baaa16d22b7e783e88c5 arch: amd64 compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8 config: https://ci.syzbot.org/builds/64881591-241a-4854-9beb-1f691566835c/config C repro: https://ci.syzbot.org/findings/98f347c8-b879-4c85-829b-a20ef900e645/c_repro syz repro: https://ci.syzbot.org/findings/98f347c8-b879-4c85-829b-a20ef900e645/syz_repro ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(val > preempt_count()) WARNING: kernel/sched/core.c:5751 at preempt_count_sub+0x9e/0x170 kernel/sched/core.c:5751, CPU#0: syz.0.17/5991 Modules linked in: CPU: 0 UID: 0 PID: 5991 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 
04/01/2014 RIP: 0010:preempt_count_sub+0xa5/0x170 kernel/sched/core.c:5751 Code: 05 cf 8f 48 c1 e8 03 0f b6 04 18 84 c0 0f 85 88 00 00 00 83 3d ef 21 3d 0e 00 75 13 48 8d 3d b2 f3 3f 0e 48 c7 c6 a0 fe 8b 8b <67> 48 0f b9 3a 90 eb b8 90 e8 5d ab 03 03 85 c0 74 2f 48 c7 c0 a4 RSP: 0018:ffffc90004437bc8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: dffffc0000000000 RCX: ffff8881746f8000 RDX: 0000000000000000 RSI: ffffffff8b8bfea0 RDI: ffffffff8fd1d770 RBP: ffffc90004437cd0 R08: ffffffff8fcf05a3 R09: 1ffffffff1f9e0b4 R10: dffffc0000000000 R11: fffffbfff1f9e0b5 R12: dffffc0000000000 R13: 1ffff92000886f84 R14: 0000000000000000 R15: 0000000000000000 FS: 000055557d481500(0000) GS:ffff88818e328000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1f89c706c0 CR3: 00000001742f0000 CR4: 00000000000006f0 Call Trace: <TASK> bpf_prog_test_run_raw_tp+0x4b9/0x6c0 net/bpf/test_run.c:794 bpf_prog_test_run+0x2c7/0x340 kernel/bpf/syscall.c:4721 __sys_bpf+0x643/0x950 kernel/bpf/syscall.c:6246 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline] __se_sys_bpf kernel/bpf/syscall.c:6339 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6339 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1f89d9acb9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff9e3fd488 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00007f1f8a015fa0 RCX: 00007f1f89d9acb9 RDX: 000000000000000c RSI: 0000200000000500 RDI: 000000000000000a RBP: 00007f1f89e08bf7 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f1f8a015fac R14: 00007f1f8a015fa0 R15: 00007f1f8a015fa0 </TASK> ---------------- Code disassembly (best 
guess): 0: 05 cf 8f 48 c1 add $0xc1488fcf,%eax 5: e8 03 0f b6 04 call 0x4b60f0d a: 18 84 c0 0f 85 88 00 sbb %al,0x88850f(%rax,%rax,8) 11: 00 00 add %al,(%rax) 13: 83 3d ef 21 3d 0e 00 cmpl $0x0,0xe3d21ef(%rip) # 0xe3d2209 1a: 75 13 jne 0x2f 1c: 48 8d 3d b2 f3 3f 0e lea 0xe3ff3b2(%rip),%rdi # 0xe3ff3d5 23: 48 c7 c6 a0 fe 8b 8b mov $0xffffffff8b8bfea0,%rsi * 2a: 67 48 0f b9 3a ud1 (%edx),%rdi <-- trapping instruction 2f: 90 nop 30: eb b8 jmp 0xffffffea 32: 90 nop 33: e8 5d ab 03 03 call 0x303ab95 38: 85 c0 test %eax,%eax 3a: 74 2f je 0x6b 3c: 48 rex.W 3d: c7 .byte 0xc7 3e: c0 .byte 0xc0 3f: a4 movsb %ds:(%rsi),%es:(%rdi) *** If these findings have caused you to resend the series or submit a separate fix, please add the following tag to your commit message: Tested-by: syzbot@syzkaller.appspotmail.com --- This report is generated by a bot. It may contain errors. syzbot ci engineers can be reached at syzkaller@googlegroups.com. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci @ 2026-02-06 17:12 ` Andrii Nakryiko 2026-02-11 7:10 ` Tao Chen 2 siblings, 1 reply; 9+ messages in thread From: Andrii Nakryiko @ 2026-02-06 17:12 UTC (permalink / raw) To: Tao Chen Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: > > The get_perf_callchain() return values may be reused if a task is preempted > after the BPF program enters migrate disable mode, so we should add > preempt_disable. And as Andrii suggested, BPF can guarantee perf callchain > buffer won't be released during use, for bpf_get_stack_id, BPF stack map > will keep them alive by delaying put_callchain_buffer() until freeing time > or for bpf_get_stack/bpf_get_task_stack, BPF program itself will hold these > buffers alive again, until freeing time which is delayed until after > RCU Tasks Trace + RCU grace period. > > Suggested-by: Andrii Nakryiko <andrii@kernel.org> > Signed-off-by: Tao Chen <chen.dylane@linux.dev> > --- > > Change list: > - v1 -> v2 > - add preempt_disable for bpf_get_stack in patch1 > - add patch2 > - v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev > > kernel/bpf/stackmap.c | 13 ++++++------- > 1 file changed, 6 insertions(+), 7 deletions(-) > Hm... looking at bpf_get_stack_pe(), I'm not sure what's the exact guarantees around that ctx->data->callchain that we pass as trace_in... It looks like it's the same temporary per-cpu callchain as in other places, just attached (temporarily) to ctx. 
So we probably want preemption disabled/enabled for that one as well, no? And to achieve that, I think we'll need to split out build_id logic out of __bpf_get_stack() and do it after preemption is enabled in the callers. Luckily it's not that much of a code and logic, should be easy. But please analyze this carefully yourself. pw-bot: cr > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c > index da3d328f5c1..1b100a03ef2 100644 > --- a/kernel/bpf/stackmap.c > +++ b/kernel/bpf/stackmap.c > @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > > max_depth = stack_map_calculate_max_depth(size, elem_size, flags); > > - if (may_fault) > - rcu_read_lock(); /* need RCU for perf's callchain below */ > + if (!trace_in) > + preempt_disable(); > > if (trace_in) { > trace = trace_in; > @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > } > > if (unlikely(!trace) || trace->nr < skip) { > - if (may_fault) > - rcu_read_unlock(); > + if (!trace_in) > + preempt_enable(); > goto err_fault; > } > > @@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > memcpy(buf, ips, copy_len); > } > > - /* trace/ips should not be dereferenced after this point */ > - if (may_fault) > - rcu_read_unlock(); > + if (!trace_in) > + preempt_enable(); > > if (user_build_id) > stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); > -- > 2.48.1 > ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko @ 2026-02-11 7:10 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-11 7:10 UTC (permalink / raw) To: Andrii Nakryiko Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel 在 2026/2/7 01:12, Andrii Nakryiko 写道: > On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: >> >> The get_perf_callchain() return values may be reused if a task is preempted >> after the BPF program enters migrate disable mode, so we should add >> preempt_disable. And as Andrii suggested, BPF can guarantee perf callchain >> buffer won't be released during use, for bpf_get_stack_id, BPF stack map >> will keep them alive by delaying put_callchain_buffer() until freeing time >> or for bpf_get_stack/bpf_get_task_stack, BPF program itself will hold these >> buffers alive again, until freeing time which is delayed until after >> RCU Tasks Trace + RCU grace period. >> >> Suggested-by: Andrii Nakryiko <andrii@kernel.org> >> Signed-off-by: Tao Chen <chen.dylane@linux.dev> >> --- >> >> Change list: >> - v1 -> v2 >> - add preempt_disable for bpf_get_stack in patch1 >> - add patch2 >> - v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev >> >> kernel/bpf/stackmap.c | 13 ++++++------- >> 1 file changed, 6 insertions(+), 7 deletions(-) >> > > Hm... looking at bpf_get_stack_pe(), I'm not sure what's the exact > guarantees around that ctx->data->callchain that we pass as > trace_in... It looks like it's the same temporary per-cpu callchain as > in other places, just attached (temporarily) to ctx. So we probably > want preemption disabled/enabled for that one as well, no? 
And to see commit "1d7bf6b7d3e8" (perf/bpf: Remove preempt disable around BPF invocation) bpf_overflow_handler is called from NMI or at least hard interrupt context which is already non-preemptible. So no preemption disabled needed. > achieve that, I think we'll need to split out build_id logic out of > __bpf_get_stack() and do it after preemption is enabled in the > callers. Luckily it's not that much of a code and logic, should be > easy. But please analyze this carefully yourself. > > pw-bot: cr > > >> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c >> index da3d328f5c1..1b100a03ef2 100644 >> --- a/kernel/bpf/stackmap.c >> +++ b/kernel/bpf/stackmap.c >> @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> >> max_depth = stack_map_calculate_max_depth(size, elem_size, flags); >> >> - if (may_fault) >> - rcu_read_lock(); /* need RCU for perf's callchain below */ >> + if (!trace_in) >> + preempt_disable(); >> >> if (trace_in) { >> trace = trace_in; >> @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> } >> >> if (unlikely(!trace) || trace->nr < skip) { >> - if (may_fault) >> - rcu_read_unlock(); >> + if (!trace_in) >> + preempt_enable(); >> goto err_fault; >> } >> >> @@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> memcpy(buf, ips, copy_len); >> } >> >> - /* trace/ips should not be dereferenced after this point */ >> - if (may_fault) >> - rcu_read_unlock(); >> + if (!trace_in) >> + preempt_enable(); >> >> if (user_build_id) >> stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); >> -- >> 2.48.1 >> -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-02-11 7:18 UTC | newest] Thread overview: 9+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 9:58 ` Tao Chen 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-11 7:18 ` Tao Chen 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2026-02-11 7:10 ` Tao Chen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox