* [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack
@ 2026-02-06 9:06 Tao Chen
2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Tao Chen @ 2026-02-06 9:06 UTC (permalink / raw)
To: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo
Cc: bpf, linux-kernel, Tao Chen
The get_perf_callchain() return value may be reused if a task is preempted
after the BPF program enters migrate disable mode, so we should add
preempt_disable. And as Andrii suggested, BPF can guarantee that the perf
callchain buffer won't be released during use: for bpf_get_stackid, the BPF
stack map will keep it alive by delaying put_callchain_buffers() until freeing
time, and for bpf_get_stack/bpf_get_task_stack, the BPF program itself will
hold the buffer alive until freeing time, which is delayed until after the
RCU Tasks Trace + RCU grace period.
Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
---
Change list:
- v1 -> v2
- add preempt_disable for bpf_get_stack in patch1
- add patch2
- v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev
kernel/bpf/stackmap.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c1..1b100a03ef2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
max_depth = stack_map_calculate_max_depth(size, elem_size, flags);
- if (may_fault)
- rcu_read_lock(); /* need RCU for perf's callchain below */
+ if (!trace_in)
+ preempt_disable();
if (trace_in) {
trace = trace_in;
@@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
}
if (unlikely(!trace) || trace->nr < skip) {
- if (may_fault)
- rcu_read_unlock();
+ if (!trace_in)
+ preempt_enable();
goto err_fault;
}
@@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
memcpy(buf, ips, copy_len);
}
- /* trace/ips should not be dereferenced after this point */
- if (may_fault)
- rcu_read_unlock();
+ if (!trace_in)
+ preempt_enable();
if (user_build_id)
stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
--
2.48.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen @ 2026-02-06 9:06 ` Tao Chen 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2 siblings, 2 replies; 9+ messages in thread From: Tao Chen @ 2026-02-06 9:06 UTC (permalink / raw) To: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, Tao Chen The get_perf_callchain() return values may be reused if a task is preempted after the BPF program enters migrate disable mode, so we should add preempt_disable. The get build-id offset in __bpf_get_stackid may increase the length of the preempt disabled section. Luckily, it is safe to enable preempt after perf callchain ips copied to BPF map bucket memory, so we can enable preempt before stack_map_get_build_id_offset. 
Signed-off-by: Tao Chen <chen.dylane@linux.dev> --- kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1b100a03ef2..d263f851f08 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -246,33 +246,50 @@ get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) #endif } -static long __bpf_get_stackid(struct bpf_map *map, - struct perf_callchain_entry *trace, u64 flags) +static long __bpf_get_stackid(struct bpf_map *map, struct pt_regs *regs, + struct perf_callchain_entry *trace_in, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 hash, id, trace_nr, trace_len, i, max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + long ret; u64 *ips; bool hash_matches; + struct perf_callchain_entry *trace; + + max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); + if (trace_in) { + trace = trace_in; + } else { + preempt_disable(); + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false, 0); + if (unlikely(!trace)) { + ret = -EFAULT; + goto go_out; + } + } - if (trace->nr <= skip) + if (trace->nr <= skip) { /* skipping more than usable stack trace */ - return -EFAULT; + ret = -EFAULT; + goto go_out; + } - max_depth = stack_map_calculate_max_depth(map->value_size, stack_map_data_size(map), flags); trace_nr = min_t(u32, trace->nr - skip, max_depth - skip); trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); + ret = id; bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) - return id; + goto go_out; if 
(stack_map_use_build_id(map)) { struct bpf_stack_build_id *id_offs; @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; + if (unlikely(!new_bucket)) { + ret = -ENOMEM; + goto go_out; + } new_bucket->nr = trace_nr; id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; + + /* + * It is safe after perf callchain ips copied to bucket buffer + * to reduce the length of preempt section, we can enable preempt here. + */ + if (!trace_in) + preempt_enable(); + stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && @@ -300,14 +327,19 @@ static long __bpf_get_stackid(struct bpf_map *map, } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) - return id; - if (bucket && !(flags & BPF_F_REUSE_STACKID)) - return -EEXIST; + goto go_out; + + if (bucket && !(flags & BPF_F_REUSE_STACKID)) { + ret = -EEXIST; + goto go_out; + } new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); - if (unlikely(!new_bucket)) - return -ENOMEM; + if (unlikely(!new_bucket)) { + ret = -ENOMEM; + goto go_out; + } memcpy(new_bucket->data, ips, trace_len); } @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); - return id; + +go_out: + if (!trace_in) + preempt_enable(); + return ret; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { - u32 elem_size = stack_map_data_size(map); - bool user = flags & BPF_F_USER_STACK; - struct perf_callchain_entry *trace; - bool kernel = !user; - 
u32 max_depth; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - max_depth = stack_map_calculate_max_depth(map->value_size, elem_size, flags); - trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false, 0); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - - return __bpf_get_stackid(map, trace, flags); + return __bpf_get_stackid(map, regs, NULL, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { @@ -395,7 +417,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, if (kernel) { trace->nr = nr_kernel; - ret = __bpf_get_stackid(map, trace, flags); + ret = __bpf_get_stackid(map, NULL, trace, flags); } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; @@ -404,7 +426,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - ret = __bpf_get_stackid(map, trace, flags); + ret = __bpf_get_stackid(map, NULL, trace, flags); } /* restore nr */ -- 2.48.1 ^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen @ 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 9:58 ` Tao Chen 2026-02-06 17:20 ` Andrii Nakryiko 1 sibling, 1 reply; 9+ messages in thread From: bot+bpf-ci @ 2026-02-06 9:34 UTC (permalink / raw) To: chen.dylane, song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, chen.dylane, ast, andrii, daniel, martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai [-- Attachment #1: Type: text/plain, Size: 3011 bytes --] > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c > index 1b100a03ef2e..d263f851f084 100644 > --- a/kernel/bpf/stackmap.c > +++ b/kernel/bpf/stackmap.c [ ... ] > @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, > /* for build_id+offset, pop a bucket before slow cmp */ > new_bucket = (struct stack_map_bucket *) > pcpu_freelist_pop(&smap->freelist); > - if (unlikely(!new_bucket)) > - return -ENOMEM; > + if (unlikely(!new_bucket)) { > + ret = -ENOMEM; > + goto go_out; > + } > new_bucket->nr = trace_nr; > id_offs = (struct bpf_stack_build_id *)new_bucket->data; > for (i = 0; i < trace_nr; i++) > id_offs[i].ip = ips[i]; > + > + /* > + * It is safe after perf callchain ips copied to bucket buffer > + * to reduce the length of preempt section, we can enable preempt here. > + */ > + if (!trace_in) > + preempt_enable(); > + When the stack_map_use_build_id() path falls through to insert a new entry (neither the hash_matches early return nor the -EEXIST early return is taken), this preempt_enable() is called here, and then the code continues to go_out where preempt_enable() is called again. The sequence when trace_in == NULL: preempt_disable() // entering __bpf_get_stackid ... if (!trace_in) preempt_enable(); // here, after copying IPs ... 
// fall through to insert new entry new_bucket->hash = hash; ... xchg(&smap->buckets[id], new_bucket); ... go_out: if (!trace_in) preempt_enable(); // second enable - imbalanced The two direct returns (return id and return -EEXIST) after the early preempt_enable() correctly bypass go_out. However, when inserting a new stack trace (the normal case when no exact match exists and the bucket can be reused), the code falls through to go_out and calls preempt_enable() a second time. Could this cause preempt_count underflow when using BPF_F_STACK_BUILD_ID maps with bpf_get_stackid()? > @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, > old_bucket = xchg(&smap->buckets[id], new_bucket); > if (old_bucket) > pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); > - return id; > + > +go_out: > + if (!trace_in) > + preempt_enable(); > + return ret; > } --- AI reviewed your patch. Please fix the bug or email reply why it's not a bug. See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21745231308 AI-authorship-score: low AI-authorship-explanation: The commit shows typical human patterns including a control-flow bug from manual refactoring that AI would likely catch, combined with natural but slightly awkward phrasing in comments. issues-found: 1 issue-severity-score: high issue-severity-explanation: Preempt count underflow in the build_id path causes kernel warnings and potential system instability when inserting new stack traces. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:34 ` bot+bpf-ci @ 2026-02-06 9:58 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-06 9:58 UTC (permalink / raw) To: bot+bpf-ci, song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo Cc: bpf, linux-kernel, martin.lau, clm, ihor.solodrai 在 2026/2/6 17:34, bot+bpf-ci@kernel.org 写道: >> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c >> index 1b100a03ef2e..d263f851f084 100644 >> --- a/kernel/bpf/stackmap.c >> +++ b/kernel/bpf/stackmap.c > > [ ... ] > >> @@ -280,12 +297,22 @@ static long __bpf_get_stackid(struct bpf_map *map, >> /* for build_id+offset, pop a bucket before slow cmp */ >> new_bucket = (struct stack_map_bucket *) >> pcpu_freelist_pop(&smap->freelist); >> - if (unlikely(!new_bucket)) >> - return -ENOMEM; >> + if (unlikely(!new_bucket)) { >> + ret = -ENOMEM; >> + goto go_out; >> + } >> new_bucket->nr = trace_nr; >> id_offs = (struct bpf_stack_build_id *)new_bucket->data; >> for (i = 0; i < trace_nr; i++) >> id_offs[i].ip = ips[i]; >> + >> + /* >> + * It is safe after perf callchain ips copied to bucket buffer >> + * to reduce the length of preempt section, we can enable preempt here. >> + */ >> + if (!trace_in) >> + preempt_enable(); >> + > > When the stack_map_use_build_id() path falls through to insert a new entry > (neither the hash_matches early return nor the -EEXIST early return is > taken), this preempt_enable() is called here, and then the code continues > to go_out where preempt_enable() is called again. > > The sequence when trace_in == NULL: > > preempt_disable() // entering __bpf_get_stackid > ... > if (!trace_in) > preempt_enable(); // here, after copying IPs > ... > // fall through to insert new entry > new_bucket->hash = hash; > ... > xchg(&smap->buckets[id], new_bucket); > ... 
> go_out: > if (!trace_in) > preempt_enable(); // second enable - imbalanced > > The two direct returns (return id and return -EEXIST) after the early > preempt_enable() correctly bypass go_out. However, when inserting a new > stack trace (the normal case when no exact match exists and the bucket > can be reused), the code falls through to go_out and calls preempt_enable() > a second time. > will fix it in v3, and wait for other comments from matainers. > Could this cause preempt_count underflow when using BPF_F_STACK_BUILD_ID > maps with bpf_get_stackid()? > >> @@ -317,31 +349,21 @@ static long __bpf_get_stackid(struct bpf_map *map, >> old_bucket = xchg(&smap->buckets[id], new_bucket); >> if (old_bucket) >> pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); >> - return id; >> + >> +go_out: >> + if (!trace_in) >> + preempt_enable(); >> + return ret; >> } > > > --- > AI reviewed your patch. Please fix the bug or email reply why it's not a bug. > See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md > > CI run summary: https://github.com/kernel-patches/bpf/actions/runs/21745231308 > > AI-authorship-score: low > AI-authorship-explanation: The commit shows typical human patterns including a control-flow bug from manual refactoring that AI would likely catch, combined with natural but slightly awkward phrasing in comments. > issues-found: 1 > issue-severity-score: high > issue-severity-explanation: Preempt count underflow in the build_id path causes kernel warnings and potential system instability when inserting new stack traces. -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 9:34 ` bot+bpf-ci @ 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-11 7:18 ` Tao Chen 1 sibling, 1 reply; 9+ messages in thread From: Andrii Nakryiko @ 2026-02-06 17:20 UTC (permalink / raw) To: Tao Chen Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: > > The get_perf_callchain() return values may be reused if a task is preempted > after the BPF program enters migrate disable mode, so we should add > preempt_disable. > The get build-id offset in __bpf_get_stackid may increase the length > of the preempt disabled section. Luckily, it is safe to enable preempt > after perf callchain ips copied to BPF map bucket memory, so we can enable > preempt before stack_map_get_build_id_offset. > > Signed-off-by: Tao Chen <chen.dylane@linux.dev> > --- > kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- > 1 file changed, 53 insertions(+), 31 deletions(-) > Let's take a step back and look at the bigger picture. This whole trace_in thing is still problematic because one way or another we do get perf_callchain_entry, which *needs* preemption disable. So we need to refactor __bpf_get_stackid() in such a way that we can do all these steps separately: a) get stackmap entry (pcpu_freelist_pop parts) b) (under disabled preemption) get temporary per-cpu callchain_entry and copy its IPs into stackmap entry c) (now with preemption enabled) perform build ID fetch and transformation Current logic is too coupled together to allow this, but conceptually there is nothing preventing us from breaking __bpf_get_stackid() (and see my replies in previous email, we should do similar breakup for __bpf_get_stack) into few separate steps. 
We have to do this to make all of this work in all the combinations of sleepable/non-sleepable and with/without build id, IMO. Take your time, don't rush, think this through. This is a bit of a maze of code, we should untangle it properly, not hack our way through it. [...] ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid 2026-02-06 17:20 ` Andrii Nakryiko @ 2026-02-11 7:18 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-11 7:18 UTC (permalink / raw) To: Andrii Nakryiko Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel 在 2026/2/7 01:20, Andrii Nakryiko 写道: > On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: >> >> The get_perf_callchain() return values may be reused if a task is preempted >> after the BPF program enters migrate disable mode, so we should add >> preempt_disable. >> The get build-id offset in __bpf_get_stackid may increase the length >> of the preempt disabled section. Luckily, it is safe to enable preempt >> after perf callchain ips copied to BPF map bucket memory, so we can enable >> preempt before stack_map_get_build_id_offset. >> >> Signed-off-by: Tao Chen <chen.dylane@linux.dev> >> --- >> kernel/bpf/stackmap.c | 84 +++++++++++++++++++++++++++---------------- >> 1 file changed, 53 insertions(+), 31 deletions(-) >> > > Let's take a step back and look at the bigger picture. This whole > trace_in thing is still problematic because one way or another we do > get perf_callchain_entry, which *needs* preemption disable. So we need > to refactor __bpf_get_stackid() in such a way that we can do all these > steps separately: > > a) get stackmap entry (pcpu_freelist_pop parts) > b) (under disabled preemption) get temporary per-cpu callchain_entry > and copy its IPs into stackmap entry > c) (now with preemption enabled) perform build ID fetch and transformation > Ok, I've tried to follow this approach to make the code clearer. Thanks. 
> Current logic is too coupled together to allow this, but conceptually > there is nothing preventing us from breaking __bpf_get_stackid() (and > see my replies in previous email, we should do similar breakup for > __bpf_get_stack) into few separate steps. > > We have to do this to make all of this work in all the combinations of > sleepable/non-sleepable and with/without build id, IMO. > > Take your time, don't rush, think this through. This is a bit of a > maze of code, we should untangle it properly, not hack our way through > it. > > [...] -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
* [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen @ 2026-02-06 14:19 ` syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2 siblings, 0 replies; 9+ messages in thread From: syzbot ci @ 2026-02-06 14:19 UTC (permalink / raw) To: andrii, ast, bpf, chen.dylane, daniel, eddyz87, haoluo, john.fastabend, jolsa, kpsingh, linux-kernel, martin.lau, sdf, song, yonghong.song Cc: syzbot, syzkaller-bugs syzbot ci has tested the following series [v2] bpf: Add preempt disable for bpf_get_stack https://lore.kernel.org/all/20260206090653.1336687-1-chen.dylane@linux.dev * [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack * [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid and found the following issue: WARNING in preempt_count_sub Full report is available here: https://ci.syzbot.org/series/90d08df2-d19d-404f-b9dd-c201605b83a2 *** WARNING in preempt_count_sub tree: bpf-next URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git base: 1ace9bac1ad2bc6a0a70baaa16d22b7e783e88c5 arch: amd64 compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8 config: https://ci.syzbot.org/builds/64881591-241a-4854-9beb-1f691566835c/config C repro: https://ci.syzbot.org/findings/98f347c8-b879-4c85-829b-a20ef900e645/c_repro syz repro: https://ci.syzbot.org/findings/98f347c8-b879-4c85-829b-a20ef900e645/syz_repro ------------[ cut here ]------------ DEBUG_LOCKS_WARN_ON(val > preempt_count()) WARNING: kernel/sched/core.c:5751 at preempt_count_sub+0x9e/0x170 kernel/sched/core.c:5751, CPU#0: syz.0.17/5991 Modules linked in: CPU: 0 UID: 0 PID: 5991 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 
04/01/2014 RIP: 0010:preempt_count_sub+0xa5/0x170 kernel/sched/core.c:5751 Code: 05 cf 8f 48 c1 e8 03 0f b6 04 18 84 c0 0f 85 88 00 00 00 83 3d ef 21 3d 0e 00 75 13 48 8d 3d b2 f3 3f 0e 48 c7 c6 a0 fe 8b 8b <67> 48 0f b9 3a 90 eb b8 90 e8 5d ab 03 03 85 c0 74 2f 48 c7 c0 a4 RSP: 0018:ffffc90004437bc8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: dffffc0000000000 RCX: ffff8881746f8000 RDX: 0000000000000000 RSI: ffffffff8b8bfea0 RDI: ffffffff8fd1d770 RBP: ffffc90004437cd0 R08: ffffffff8fcf05a3 R09: 1ffffffff1f9e0b4 R10: dffffc0000000000 R11: fffffbfff1f9e0b5 R12: dffffc0000000000 R13: 1ffff92000886f84 R14: 0000000000000000 R15: 0000000000000000 FS: 000055557d481500(0000) GS:ffff88818e328000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1f89c706c0 CR3: 00000001742f0000 CR4: 00000000000006f0 Call Trace: <TASK> bpf_prog_test_run_raw_tp+0x4b9/0x6c0 net/bpf/test_run.c:794 bpf_prog_test_run+0x2c7/0x340 kernel/bpf/syscall.c:4721 __sys_bpf+0x643/0x950 kernel/bpf/syscall.c:6246 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline] __se_sys_bpf kernel/bpf/syscall.c:6339 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6339 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f1f89d9acb9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff9e3fd488 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00007f1f8a015fa0 RCX: 00007f1f89d9acb9 RDX: 000000000000000c RSI: 0000200000000500 RDI: 000000000000000a RBP: 00007f1f89e08bf7 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f1f8a015fac R14: 00007f1f8a015fa0 R15: 00007f1f8a015fa0 </TASK> ---------------- Code disassembly (best 
guess): 0: 05 cf 8f 48 c1 add $0xc1488fcf,%eax 5: e8 03 0f b6 04 call 0x4b60f0d a: 18 84 c0 0f 85 88 00 sbb %al,0x88850f(%rax,%rax,8) 11: 00 00 add %al,(%rax) 13: 83 3d ef 21 3d 0e 00 cmpl $0x0,0xe3d21ef(%rip) # 0xe3d2209 1a: 75 13 jne 0x2f 1c: 48 8d 3d b2 f3 3f 0e lea 0xe3ff3b2(%rip),%rdi # 0xe3ff3d5 23: 48 c7 c6 a0 fe 8b 8b mov $0xffffffff8b8bfea0,%rsi * 2a: 67 48 0f b9 3a ud1 (%edx),%rdi <-- trapping instruction 2f: 90 nop 30: eb b8 jmp 0xffffffea 32: 90 nop 33: e8 5d ab 03 03 call 0x303ab95 38: 85 c0 test %eax,%eax 3a: 74 2f je 0x6b 3c: 48 rex.W 3d: c7 .byte 0xc7 3e: c0 .byte 0xc0 3f: a4 movsb %ds:(%rsi),%es:(%rdi) *** If these findings have caused you to resend the series or submit a separate fix, please add the following tag to your commit message: Tested-by: syzbot@syzkaller.appspotmail.com --- This report is generated by a bot. It may contain errors. syzbot ci engineers can be reached at syzkaller@googlegroups.com. ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci @ 2026-02-06 17:12 ` Andrii Nakryiko 2026-02-11 7:10 ` Tao Chen 2 siblings, 1 reply; 9+ messages in thread From: Andrii Nakryiko @ 2026-02-06 17:12 UTC (permalink / raw) To: Tao Chen Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: > > The get_perf_callchain() return values may be reused if a task is preempted > after the BPF program enters migrate disable mode, so we should add > preempt_disable. And as Andrii suggested, BPF can guarantee perf callchain > buffer won't be released during use, for bpf_get_stack_id, BPF stack map > will keep them alive by delaying put_callchain_buffer() until freeing time > or for bpf_get_stack/bpf_get_task_stack, BPF program itself will hold these > buffers alive again, until freeing time which is delayed until after > RCU Tasks Trace + RCU grace period. > > Suggested-by: Andrii Nakryiko <andrii@kernel.org> > Signed-off-by: Tao Chen <chen.dylane@linux.dev> > --- > > Change list: > - v1 -> v2 > - add preempt_disable for bpf_get_stack in patch1 > - add patch2 > - v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev > > kernel/bpf/stackmap.c | 13 ++++++------- > 1 file changed, 6 insertions(+), 7 deletions(-) > Hm... looking at bpf_get_stack_pe(), I'm not sure what's the exact guarantees around that ctx->data->callchain that we pass as trace_in... It looks like it's the same temporary per-cpu callchain as in other places, just attached (temporarily) to ctx. 
So we probably want preemption disabled/enabled for that one as well, no? And to achieve that, I think we'll need to split out build_id logic out of __bpf_get_stack() and do it after preemption is enabled in the callers. Luckily it's not that much of a code and logic, should be easy. But please analyze this carefully yourself. pw-bot: cr > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c > index da3d328f5c1..1b100a03ef2 100644 > --- a/kernel/bpf/stackmap.c > +++ b/kernel/bpf/stackmap.c > @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > > max_depth = stack_map_calculate_max_depth(size, elem_size, flags); > > - if (may_fault) > - rcu_read_lock(); /* need RCU for perf's callchain below */ > + if (!trace_in) > + preempt_disable(); > > if (trace_in) { > trace = trace_in; > @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > } > > if (unlikely(!trace) || trace->nr < skip) { > - if (may_fault) > - rcu_read_unlock(); > + if (!trace_in) > + preempt_enable(); > goto err_fault; > } > > @@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, > memcpy(buf, ips, copy_len); > } > > - /* trace/ips should not be dereferenced after this point */ > - if (may_fault) > - rcu_read_unlock(); > + if (!trace_in) > + preempt_enable(); > > if (user_build_id) > stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); > -- > 2.48.1 > ^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko @ 2026-02-11 7:10 ` Tao Chen 0 siblings, 0 replies; 9+ messages in thread From: Tao Chen @ 2026-02-11 7:10 UTC (permalink / raw) To: Andrii Nakryiko Cc: song, jolsa, ast, daniel, andrii, martin.lau, eddyz87, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, bpf, linux-kernel 在 2026/2/7 01:12, Andrii Nakryiko 写道: > On Fri, Feb 6, 2026 at 1:07 AM Tao Chen <chen.dylane@linux.dev> wrote: >> >> The get_perf_callchain() return values may be reused if a task is preempted >> after the BPF program enters migrate disable mode, so we should add >> preempt_disable. And as Andrii suggested, BPF can guarantee perf callchain >> buffer won't be released during use, for bpf_get_stack_id, BPF stack map >> will keep them alive by delaying put_callchain_buffer() until freeing time >> or for bpf_get_stack/bpf_get_task_stack, BPF program itself will hold these >> buffers alive again, until freeing time which is delayed until after >> RCU Tasks Trace + RCU grace period. >> >> Suggested-by: Andrii Nakryiko <andrii@kernel.org> >> Signed-off-by: Tao Chen <chen.dylane@linux.dev> >> --- >> >> Change list: >> - v1 -> v2 >> - add preempt_disable for bpf_get_stack in patch1 >> - add patch2 >> - v1: https://lore.kernel.org/bpf/20260128165710.928294-1-chen.dylane@linux.dev >> >> kernel/bpf/stackmap.c | 13 ++++++------- >> 1 file changed, 6 insertions(+), 7 deletions(-) >> > > Hm... looking at bpf_get_stack_pe(), I'm not sure what's the exact > guarantees around that ctx->data->callchain that we pass as > trace_in... It looks like it's the same temporary per-cpu callchain as > in other places, just attached (temporarily) to ctx. So we probably > want preemption disabled/enabled for that one as well, no? 
And to see commit "1d7bf6b7d3e8" (perf/bpf: Remove preempt disable around BPF invocation) bpf_overflow_handler is called from NMI or at least hard interrupt context which is already non-preemptible. So no preemption disabled needed. > achieve that, I think we'll need to split out build_id logic out of > __bpf_get_stack() and do it after preemption is enabled in the > callers. Luckily it's not that much of a code and logic, should be > easy. But please analyze this carefully yourself. > > pw-bot: cr > > >> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c >> index da3d328f5c1..1b100a03ef2 100644 >> --- a/kernel/bpf/stackmap.c >> +++ b/kernel/bpf/stackmap.c >> @@ -460,8 +460,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> >> max_depth = stack_map_calculate_max_depth(size, elem_size, flags); >> >> - if (may_fault) >> - rcu_read_lock(); /* need RCU for perf's callchain below */ >> + if (!trace_in) >> + preempt_disable(); >> >> if (trace_in) { >> trace = trace_in; >> @@ -474,8 +474,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> } >> >> if (unlikely(!trace) || trace->nr < skip) { >> - if (may_fault) >> - rcu_read_unlock(); >> + if (!trace_in) >> + preempt_enable(); >> goto err_fault; >> } >> >> @@ -493,9 +493,8 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, >> memcpy(buf, ips, copy_len); >> } >> >> - /* trace/ips should not be dereferenced after this point */ >> - if (may_fault) >> - rcu_read_unlock(); >> + if (!trace_in) >> + preempt_enable(); >> >> if (user_build_id) >> stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); >> -- >> 2.48.1 >> -- Best Regards Tao Chen ^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-02-11 7:18 UTC | newest] Thread overview: 9+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-02-06 9:06 [PATCH bpf-next v2 1/2] bpf: Add preempt disable for bpf_get_stack Tao Chen 2026-02-06 9:06 ` [PATCH bpf-next v2 2/2] bpf: Add preempt disable for bpf_get_stackid Tao Chen 2026-02-06 9:34 ` bot+bpf-ci 2026-02-06 9:58 ` Tao Chen 2026-02-06 17:20 ` Andrii Nakryiko 2026-02-11 7:18 ` Tao Chen 2026-02-06 14:19 ` [syzbot ci] Re: bpf: Add preempt disable for bpf_get_stack syzbot ci 2026-02-06 17:12 ` [PATCH bpf-next v2 1/2] " Andrii Nakryiko 2026-02-11 7:10 ` Tao Chen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox