Re: [PATCH bpf v2 2/2] bpf: Avoid faultable build ID reads under mm locks

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>
To: Ihor Solodrai <ihor.solodrai@linux.dev>,
	Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Song Liu <song@kernel.org>
Cc: Puranjay Mohan <puranjay@kernel.org>,
	Shakeel Butt <shakeel.butt@linux.dev>,
	bpf@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-team@meta.com
Subject: Re: [PATCH bpf v2 2/2] bpf: Avoid faultable build ID reads under mm locks
Date: Thu, 9 Apr 2026 17:34:19 +0100	[thread overview]
Message-ID: <8b2cf5ca-182e-4eb2-ab90-8fab9d81f6e1@gmail.com> (raw)
In-Reply-To: <20260409010604.1439087-3-ihor.solodrai@linux.dev>



On 4/9/26 2:06 AM, Ihor Solodrai wrote:
> Sleepable build ID parsing can block in __kernel_read() [1], so the
> stackmap sleepable path must not call it while holding mmap_lock or a
> per-VMA read lock.
> 
> The issue and the fix are conceptually similar to a recent procfs
> patch [2].
> 
> Resolve each covered VMA with a stable read-side reference, preferring
> lock_vma_under_rcu() and falling back to mmap_read_lock() only long
meganit: falling back to mmap_read_trylock()?
> enough to acquire the VMA read lock. Take a reference to the backing
> file, drop the VMA lock, and then parse the build ID through
> (sleepable) build_id_parse_file().
> 
> [1]: https://lore.kernel.org/all/20251218005818.614819-1-shakeel.butt@linux.dev/
> [2]: https://lore.kernel.org/all/20260128183232.2854138-1-andrii@kernel.org/
> 
> Fixes: 777a8560fd29 ("lib/buildid: use __kernel_read() for sleepable context")
> Assisted-by: Codex:gpt-5.4
> Suggested-by: Puranjay Mohan <puranjay@kernel.org>
> Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
> ---
>   kernel/bpf/stackmap.c | 139 ++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 139 insertions(+)
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 4ef0fd06cea5..de3d89e20a1e 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -9,6 +9,7 @@
>   #include <linux/perf_event.h>
>   #include <linux/btf_ids.h>
>   #include <linux/buildid.h>
> +#include <linux/mmap_lock.h>
>   #include "percpu_freelist.h"
>   #include "mmap_unlock_work.h"
>   
> @@ -158,6 +159,139 @@ static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id)
>   	memset(id->build_id, 0, BUILD_ID_SIZE_MAX);
>   }
>   
> +enum stack_map_vma_lock_state {
> +	STACK_MAP_LOCKED_NONE = 0,
> +	STACK_MAP_LOCKED_VMA,
> +	STACK_MAP_LOCKED_MMAP,
> +};
> +
> +struct stack_map_vma_lock {
> +	enum stack_map_vma_lock_state state;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +};
> +
> +static struct vm_area_struct *stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip)
> +{
> +	struct mm_struct *mm = lock->mm;
> +	struct vm_area_struct *vma;
> +
> +	if (WARN_ON_ONCE(!mm))
> +		return NULL;
> +
> +	vma = lock_vma_under_rcu(mm, ip);
> +	if (vma)
> +		goto vma_locked;
> +
> +	if (!mmap_read_trylock(mm))
> +		return NULL;
> +
> +	vma = vma_lookup(mm, ip);
> +	if (!vma) {
> +		mmap_read_unlock(mm);
> +		return NULL;
> +	}
> +
> +#ifdef CONFIG_PER_VMA_LOCK
> +	if (!vma_start_read_locked(vma)) {
> +		mmap_read_unlock(mm);
> +		return NULL;
> +	}
> +	mmap_read_unlock(mm);
> +#else
> +	lock->state = STACK_MAP_LOCKED_MMAP;
> +	lock->vma = vma;
> +	return vma;
> +#endif
> +
> +vma_locked:
> +	lock->state = STACK_MAP_LOCKED_VMA;
> +	lock->vma = vma;
> +	return vma;
> +}
> +
> +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock)
> +{
> +	struct vm_area_struct *vma = lock->vma;
> +	struct mm_struct *mm = lock->mm;
> +
> +	switch (lock->state) {
> +	case STACK_MAP_LOCKED_VMA:
> +		if (WARN_ON_ONCE(!vma))
> +			break;
> +		vma_end_read(vma);
> +		break;
> +	case STACK_MAP_LOCKED_MMAP:
> +		if (WARN_ON_ONCE(!mm))
> +			break;
> +		mmap_read_unlock(mm);
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	lock->state = STACK_MAP_LOCKED_NONE;
> +	lock->vma = NULL;
> +}
> +
> +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs,
> +						    u32 trace_nr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct stack_map_vma_lock lock = {
> +		.state = STACK_MAP_LOCKED_NONE,
> +		.vma = NULL,
> +		.mm = mm,
> +	};
> +	struct file *file, *prev_file = NULL;
> +	unsigned long vm_pgoff, vm_start;
> +	struct vm_area_struct *vma;
> +	const char *prev_build_id;
> +	u64 ip;
> +
> +	for (u32 i = 0; i < trace_nr; i++) {
> +		ip = READ_ONCE(id_offs[i].ip);

I'm not sure if I understand why READ_ONCE is necessary here.

> +		vma = stack_map_lock_vma(&lock, ip);
> +		if (!vma || !vma->vm_file) {
> +			stack_map_build_id_set_ip(&id_offs[i]);
> +			stack_map_unlock_vma(&lock);
> +			continue;
> +		}
> +
> +		file = vma->vm_file;
> +		vm_pgoff = vma->vm_pgoff;
> +		vm_start = vma->vm_start;
> +
> +		if (file == prev_file) {

What if instead of caching prev_file, we cache vm_start and vm_end, and 
if the next IP is in range, reuse previous build id. This should 
optimize this code further, avoiding locks on the vma used on previous 
iteration.

> +			memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
> +			stack_map_unlock_vma(&lock);
> +			goto build_id_valid;
> +		}
> +
> +		file = get_file(file);
> +		stack_map_unlock_vma(&lock);
> +
> +		/* build_id_parse_file() may block on filesystem reads */
> +		if (build_id_parse_file(file, id_offs[i].build_id, NULL)) {
> +			stack_map_build_id_set_ip(&id_offs[i]);
> +			fput(file);
> +			continue;
> +		}
> +
> +		if (prev_file)
> +			fput(prev_file);
> +		prev_file = file;
> +		prev_build_id = id_offs[i].build_id;
> +
> +build_id_valid:
> +		id_offs[i].offset = (vm_pgoff << PAGE_SHIFT) + ip - vm_start;
> +		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
> +	}
> +
> +	if (prev_file)
> +		fput(prev_file);
> +}
> +
>   /*
>    * Expects all id_offs[i].ip values to be set to correct initial IPs.
>    * They will be subsequently:
> @@ -178,6 +312,11 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>   	const char *prev_build_id;
>   	int i;
>   
> +	if (may_fault && has_user_ctx) {
> +		stack_map_get_build_id_offset_sleepable(id_offs, trace_nr);
> +		return;
> +	}
> +
>   	/* If the irq_work is in use, fall back to report ips. Same
>   	 * fallback is used for kernel stack (!user) on a stackmap with
>   	 * build_id.

next prev parent reply	other threads:[~2026-04-09 16:34 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-09  1:06 [PATCH bpf v2 0/2] bpf: Implement stack_map_get_build_id_offset_sleepable() Ihor Solodrai
2026-04-09  1:06 ` [PATCH bpf v2 1/2] bpf: Factor out stack_map_build_id_set_ip() in stackmap.c Ihor Solodrai
2026-04-09 14:01   ` Mykyta Yatsenko
2026-04-09  1:06 ` [PATCH bpf v2 2/2] bpf: Avoid faultable build ID reads under mm locks Ihor Solodrai
2026-04-09  4:20   ` Ihor Solodrai
2026-04-09 14:51     ` Mykyta Yatsenko
2026-04-09 16:34   ` Mykyta Yatsenko [this message]
2026-05-12  3:40     ` Ihor Solodrai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8b2cf5ca-182e-4eb2-ab90-8fab9d81f6e1@gmail.com \
    --to=mykyta.yatsenko5@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=ihor.solodrai@linux.dev \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=puranjay@kernel.org \
    --cc=shakeel.butt@linux.dev \
    --cc=song@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.