* [PATCH bpf v3 1/3] bpf: Factor out stack_map_build_id_set_ip() in stackmap.c
2026-05-12 3:29 [PATCH bpf v3 0/3] bpf: Implement stack_map_get_build_id_offset_sleepable() Ihor Solodrai
@ 2026-05-12 3:29 ` Ihor Solodrai
2026-05-12 3:29 ` [PATCH bpf v3 2/3] bpf: Avoid faultable build ID reads under mm locks Ihor Solodrai
2026-05-12 3:29 ` [PATCH bpf v3 3/3] bpf: Cache build IDs in sleepable stackmap path Ihor Solodrai
2 siblings, 0 replies; 5+ messages in thread
From: Ihor Solodrai @ 2026-05-12 3:29 UTC (permalink / raw)
To: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Eduard Zingerman, Kumar Kartikeya Dwivedi
Cc: Puranjay Mohan, Shakeel Butt, Mykyta Yatsenko, bpf, linux-kernel,
kernel-team
Factor out a small helper from stack_map_get_build_id_offset() in
preparation for adding a sleepable build ID resolution path.
No functional changes.
Acked-by: Mykyta Yatsenko <yatsenko@meta.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
---
kernel/bpf/stackmap.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c15..4ef0fd06cea5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -152,6 +152,12 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
: build_id_parse_nofault(vma, build_id, NULL);
}
+static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
+{
+ id->status = BPF_STACK_BUILD_ID_IP;
+ memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
+}
+
/*
* Expects all id_offs[i].ip values to be set to correct initial IPs.
* They will be subsequently:
@@ -165,23 +171,21 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u32 trace_nr, bool user, bool may_fault)
{
- int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ bool has_user_ctx = user && current && current->mm;
struct vm_area_struct *vma, *prev_vma = NULL;
const char *prev_build_id;
+ int i;
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
* build_id.
*/
- if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock(current->mm)) {
+ if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
- for (i = 0; i < trace_nr; i++) {
- id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
- }
+ for (i = 0; i < trace_nr; i++)
+ stack_map_build_id_set_ip(&id_offs[i]);
return;
}
@@ -196,8 +200,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
vma = find_vma(current->mm, ip);
if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
- id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
+ stack_map_build_id_set_ip(&id_offs[i]);
continue;
}
build_id_valid:
--
2.54.0
^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH bpf v3 2/3] bpf: Avoid faultable build ID reads under mm locks
2026-05-12 3:29 [PATCH bpf v3 0/3] bpf: Implement stack_map_get_build_id_offset_sleepable() Ihor Solodrai
2026-05-12 3:29 ` [PATCH bpf v3 1/3] bpf: Factor out stack_map_build_id_set_ip() in stackmap.c Ihor Solodrai
@ 2026-05-12 3:29 ` Ihor Solodrai
2026-05-12 4:16 ` bot+bpf-ci
2026-05-12 3:29 ` [PATCH bpf v3 3/3] bpf: Cache build IDs in sleepable stackmap path Ihor Solodrai
2 siblings, 1 reply; 5+ messages in thread
From: Ihor Solodrai @ 2026-05-12 3:29 UTC (permalink / raw)
To: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Eduard Zingerman, Kumar Kartikeya Dwivedi
Cc: Puranjay Mohan, Shakeel Butt, Mykyta Yatsenko, bpf, linux-kernel,
kernel-team
Sleepable build ID parsing can block in __kernel_read() [1], so the
stackmap sleepable path must not call it while holding mmap_lock or a
per-VMA read lock.
The issue and the fix are conceptually similar to a recent procfs
patch [2].
Resolve each covered VMA with a stable read-side reference, preferring
lock_vma_under_rcu() and falling back to mmap_read_trylock() only long
enough to acquire the VMA read lock. Take a reference to the backing
file, drop the VMA lock, and then parse the build ID through
(sleepable) build_id_parse_file().
[1]: https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/
[2]: https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/
Fixes: 777a8560fd29 ("lib/buildid: use __kernel_read() for sleepable context")
Assisted-by: Codex:gpt-5.4
Suggested-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
---
kernel/bpf/stackmap.c | 105 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4ef0fd06cea5..c1e96df360c3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
+#include <linux/mmap_lock.h>
#include "percpu_freelist.h"
#include "mmap_unlock_work.h"
@@ -158,6 +159,105 @@ static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
}
+struct stack_map_vma_lock {
+ bool vma_locked;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+};
+
+static struct vm_area_struct *
+stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
+{
+ struct mm_struct *mm = lock->mm;
+ struct vm_area_struct *vma;
+
+ if (WARN_ON_ONCE(!mm))
+ return NULL;
+
+ vma = lock_vma_under_rcu(mm, ip);
+ if (vma)
+ goto vma_locked;
+
+ if (!mmap_read_trylock(mm))
+ return NULL;
+
+ vma = vma_lookup(mm, ip);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!vma_start_read_locked(vma)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ mmap_read_unlock(mm);
+#else
+ mmap_read_unlock(mm);
+ return NULL;
+#endif
+vma_locked:
+ lock->vma_locked = true;
+ lock->vma = vma;
+ return vma;
+}
+
+static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
+{
+ struct vm_area_struct *vma = lock->vma;
+
+ if (lock->vma_locked) {
+ if (WARN_ON_ONCE(!vma))
+ goto out;
+ vma_end_read(vma);
+ }
+out:
+ lock->vma_locked = false;
+ lock->vma = NULL;
+}
+
+static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
+ u32 trace_nr)
+{
+ struct mm_struct *mm = current->mm;
+ struct stack_map_vma_lock lock = {
+ .vma_locked = false,
+ .vma = NULL,
+ .mm = mm,
+ };
+ unsigned long vm_pgoff, vm_start;
+ struct vm_area_struct *vma;
+ struct file *file;
+ u64 ip;
+
+ for (u32 i = 0; i < trace_nr; i++) {
+ ip = READ_ONCE(id_offs[i].ip);
+ vma = stack_map_lock_vma(&lock, ip);
+ if (!vma || !vma->vm_file) {
+ stack_map_build_id_set_ip(&id_offs[i]);
+ stack_map_unlock_vma(&lock);
+ continue;
+ }
+
+ file = get_file(vma->vm_file);
+ vm_pgoff = vma->vm_pgoff;
+ vm_start = vma->vm_start;
+ stack_map_unlock_vma(&lock);
+
+ /* build_id_parse_file() may block on filesystem reads */
+ if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
+ stack_map_build_id_set_ip(&id_offs[i]);
+ fput(file);
+ continue;
+ }
+ fput(file);
+
+ id_offs[i].offset = (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
+ id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ }
+}
+
/*
* Expects all id_offs[i].ip values to be set to correct initial IPs.
* They will be subsequently:
@@ -178,6 +278,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
const char *prev_build_id;
int i;
+ if (may_fault && has_user_ctx) {
+ stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
+ return;
+ }
+
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
* build_id.
--
2.54.0
^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH bpf v3 2/3] bpf: Avoid faultable build ID reads under mm locks
2026-05-12 3:29 ` [PATCH bpf v3 2/3] bpf: Avoid faultable build ID reads under mm locks Ihor Solodrai
@ 2026-05-12 4:16 ` bot+bpf-ci
0 siblings, 0 replies; 5+ messages in thread
From: bot+bpf-ci @ 2026-05-12 4:16 UTC (permalink / raw)
To: ihor.solodrai, ast, andrii, daniel, eddyz87, memxor
Cc: puranjay, shakeel.butt, mykyta.yatsenko5, bpf, linux-kernel,
kernel-team, ast, andrii, daniel, martin.lau, eddyz87,
yonghong.song, clm, ihor.solodrai
[-- Attachment #1: Type: text/plain, Size: 6600 bytes --]
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 4ef0fd06cea5..c1e96df360c3 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
[ ... ]
> @@ -158,6 +159,105 @@ static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
> memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
> }
>
> +struct stack_map_vma_lock {
> + bool vma_locked;
> + struct vm_area_struct *vma;
> + struct mm_struct *mm;
> +};
> +
> +static struct vm_area_struct *
> +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
> +{
> + struct mm_struct *mm = lock->mm;
> + struct vm_area_struct *vma;
> +
> + if (WARN_ON_ONCE(!mm))
> + return NULL;
> +
> + vma = lock_vma_under_rcu(mm, ip);
> + if (vma)
> + goto vma_locked;
> +
> + if (!mmap_read_trylock(mm))
> + return NULL;
> +
> + vma = vma_lookup(mm, ip);
> + if (!vma) {
> + mmap_read_unlock(mm);
> + return NULL;
> + }
> +
> +#ifdef CONFIG_PER_VMA_LOCK
> + if (!vma_start_read_locked(vma)) {
> + mmap_read_unlock(mm);
> + return NULL;
> + }
> + mmap_read_unlock(mm);
> +#else
> + mmap_read_unlock(mm);
> + return NULL;
> +#endif
> +vma_locked:
> + lock->vma_locked = true;
> + lock->vma = vma;
> + return vma;
> +}
> +
> +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
> +{
> + struct vm_area_struct *vma = lock->vma;
> +
> + if (lock->vma_locked) {
> + if (WARN_ON_ONCE(!vma))
> + goto out;
> + vma_end_read(vma);
> + }
> +out:
> + lock->vma_locked = false;
> + lock->vma = NULL;
> +}
> +
> +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
> + u32 trace_nr)
> +{
> + struct mm_struct *mm = current->mm;
> + struct stack_map_vma_lock lock = {
> + .vma_locked = false,
> + .vma = NULL,
> + .mm = mm,
> + };
> + unsigned long vm_pgoff, vm_start;
> + struct vm_area_struct *vma;
> + struct file *file;
> + u64 ip;
> +
> + for (u32 i = 0; i < trace_nr; i++) {
> + ip = READ_ONCE(id_offs[i].ip);
> + vma = stack_map_lock_vma(&lock, ip);
> + if (!vma || !vma->vm_file) {
^^^^^^^^
A subsystem pattern flags this as potentially concerning: the code uses
!vma->vm_file to classify the VMA as anonymous. Per mm-vma.md's VMA
classification rules, the correct test for anonymous VMAs is
vma_is_anonymous() (i.e. !vma->vm_ops). Some anonymous VMAs carry a
vm_file reference (e.g. private mappings of /dev/zero set via
vma_set_anonymous() while leaving vm_file populated).
On such VMAs, this code would proceed down the file-backed path and hand
/dev/zero (or any other vma_set_anonymous()-style driver file) to
build_id_parse_file() instead of immediately falling back to reporting
the raw IP. build_id_parse_file() would fail the ELF-magic check and
return an error, so the practical effect is a wasted pinned-file
reference and a pointless sleepable read attempt. However, the mm
subsystem guide directs that this pattern be reported.
The existing non-sleepable stack_map_get_build_id_offset()/build_id_parse()
in this same subsystem also uses !vma->vm_file, so the new code is
consistent with pre-existing style.
> + stack_map_build_id_set_ip(&id_offs[i]);
> + stack_map_unlock_vma(&lock);
> + continue;
> + }
Mykyta Yatsenko suggested an optimization in v2 that has not been
implemented: cache vm_start and vm_end, and if the next IP is in range,
reuse the previous build ID without locking the VMA. This would avoid
repeated lock/unlock cycles for consecutive IPs in the same VMA.
The current v3 code only caches the file reference for same-file
optimization, not the VMA range. See the original discussion at
https://lore.kernel.org/bpf/8b2cf5ca-182e-4eb2-ab90-8fab9d81f6e1@gmail.com/
> +
> + file = get_file(vma->vm_file);
> + vm_pgoff = vma->vm_pgoff;
> + vm_start = vma->vm_start;
> + stack_map_unlock_vma(&lock);
> +
> + /* build_id_parse_file() may block on filesystem reads */
> + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
> + stack_map_build_id_set_ip(&id_offs[i]);
> + fput(file);
> + continue;
> + }
> + fput(file);
> +
> + id_offs[i].offset = (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
> + id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
> + }
> +}
> +
> /*
> * Expects all id_offs[i].ip values to be set to correct initial IPs.
> * They will be subsequently:
> @@ -178,6 +278,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> const char *prev_build_id;
> int i;
>
> + if (may_fault && has_user_ctx) {
> + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
> + return;
> + }
> +
> /* If the irq_work is in use, fall back to report ips. Same
> * fallback is used for kernel stack (!user) on a stackmap with
> * build_id.
Commit message:
> bpf: Avoid faultable build ID reads under mm locks
>
> Sleepable build ID parsing can block in __kernel_read() [1], so the
> stackmap sleepable path must not call it while holding mmap_lock or a
> per-VMA read lock.
> The issue and the fix are conceptually similar to a recent procfs
> patch [2].
> Resolve each covered VMA with a stable read-side reference, preferring
> lock_vma_under_rcu() and falling back to mmap_read_trylock() only long
> enough to acquire the VMA read lock. Take a reference to the backing
> file, drop the VMA lock, and then parse the build ID through
> (sleepable) build_id_parse_file().
> [1]: https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/
> [2]: https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/
> Fixes: 777a8560fd29 ("lib/buildid: use __kernel_read() for sleepable context")
The existing Fixes tag points to commit 777a8560fd29 ("lib/buildid: use
__kernel_read() for sleepable context"), but the bug was actually
introduced by commit d4dd9775ec24 ("bpf: wire up sleepable
bpf_get_stack() and bpf_get_task_stack() helpers").
The d4dd9775ec24 commit introduced the pattern of calling potentially
sleepable build_id_parse() while holding mm locks via find_vma(). Commit
777a8560fd29 only made the latent blocking behavior more severe by using
__kernel_read(), but the architectural issue of calling sleepable
functions under locks was introduced earlier.
Should this be:
Fixes: d4dd9775ec24 ("bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers")
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/25711733137
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH bpf v3 3/3] bpf: Cache build IDs in sleepable stackmap path
2026-05-12 3:29 [PATCH bpf v3 0/3] bpf: Implement stack_map_get_build_id_offset_sleepable() Ihor Solodrai
2026-05-12 3:29 ` [PATCH bpf v3 1/3] bpf: Factor out stack_map_build_id_set_ip() in stackmap.c Ihor Solodrai
2026-05-12 3:29 ` [PATCH bpf v3 2/3] bpf: Avoid faultable build ID reads under mm locks Ihor Solodrai
@ 2026-05-12 3:29 ` Ihor Solodrai
2 siblings, 0 replies; 5+ messages in thread
From: Ihor Solodrai @ 2026-05-12 3:29 UTC (permalink / raw)
To: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Eduard Zingerman, Kumar Kartikeya Dwivedi
Cc: Puranjay Mohan, Shakeel Butt, Mykyta Yatsenko, bpf, linux-kernel,
kernel-team
Stack traces often contain adjacent IPs from the same VMA or from
different VMAs backed by the same ELF file. Cache the last successfully
parsed build ID together with the resolved VMA range and backing file
so the sleepable build-ID path can avoid repeated VMA locking and file
parsing in common cases.
Suggested-by: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
---
kernel/bpf/stackmap.c | 51 ++++++++++++++++++++++++++++++++++++++++---
1 file changed, 48 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index c1e96df360c3..318ce9ed0dd5 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -226,13 +226,34 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i
.vma = NULL,
.mm = mm,
};
- unsigned long vm_pgoff, vm_start;
+ struct {
+ struct file *file;
+ const char *build_id;
+ unsigned long vm_start;
+ unsigned long vm_end;
+ unsigned long vm_pgoff;
+ } cache = {};
+ unsigned long vm_pgoff, vm_start, vm_end;
struct vm_area_struct *vma;
struct file *file;
u64 ip;
for (u32 i = 0; i < trace_nr; i++) {
ip = READ_ONCE(id_offs[i].ip);
+
+ /*
+ * Range cache fast path: if ip falls within the previously
+		 * resolved VMA range, reuse the cached build_id without
+ * re-acquiring the VMA lock.
+ */
+ if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) {
+ memcpy(id_offs[i].build_id, cache.build_id, BUILD_ID_SIZE_MAX);
+ vm_start = cache.vm_start;
+ vm_end = cache.vm_end;
+ vm_pgoff = cache.vm_pgoff;
+ goto build_id_valid;
+ }
+
vma = stack_map_lock_vma(&lock, ip);
if (!vma || !vma->vm_file) {
stack_map_build_id_set_ip(&id_offs[i]);
@@ -240,9 +261,22 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i
continue;
}
- file = get_file(vma->vm_file);
+ file = vma->vm_file;
vm_pgoff = vma->vm_pgoff;
vm_start = vma->vm_start;
+ vm_end = vma->vm_end;
+
+ if (file == cache.file) {
+ /*
+ * Same backing file as previous (e.g. different VMAs
+			 * of the same ELF binary). Reuse the cached build_id.
+ */
+ memcpy(id_offs[i].build_id, cache.build_id, BUILD_ID_SIZE_MAX);
+ stack_map_unlock_vma(&lock);
+ goto build_id_valid;
+ }
+
+ file = get_file(file);
stack_map_unlock_vma(&lock);
/* build_id_parse_file() may block on filesystem reads */
@@ -251,11 +285,22 @@ static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *i
fput(file);
continue;
}
- fput(file);
+ if (cache.file)
+ fput(cache.file);
+ cache.file = file;
+ cache.build_id = id_offs[i].build_id;
+
+build_id_valid:
+ cache.vm_start = vm_start;
+ cache.vm_end = vm_end;
+ cache.vm_pgoff = vm_pgoff;
id_offs[i].offset = (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
+
+ if (cache.file)
+ fput(cache.file);
}
/*
--
2.54.0
^ permalink raw reply related [flat|nested] 5+ messages in thread