* [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work
2026-03-09 15:54 [PATCH bpf v2 0/4] bpf: fix and improve open-coded task_vma iterator Puranjay Mohan
@ 2026-03-09 15:54 ` Puranjay Mohan
2026-03-11 18:32 ` Andrii Nakryiko
2026-03-09 15:54 ` [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator Puranjay Mohan
` (2 subsequent siblings)
3 siblings, 1 reply; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-09 15:54 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, Mykyta Yatsenko,
kernel-team
The next commit will be reusing mmap_unlock_irq_work to do mmput_async()
in irq_work. Rather than creating a new structure, rename
mmap_unlock_irq_work to bpf_iter_mm_irq_work and reuse it.
This is only a rename with no functional changes.
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
kernel/bpf/mmap_unlock_work.h | 12 ++++++------
kernel/bpf/stackmap.c | 2 +-
kernel/bpf/task_iter.c | 12 ++++++------
3 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
index 5d18d7d85bef..ba8e860f1180 100644
--- a/kernel/bpf/mmap_unlock_work.h
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -6,13 +6,13 @@
#define __MMAP_UNLOCK_WORK_H__
#include <linux/irq_work.h>
-/* irq_work to run mmap_read_unlock() in irq_work */
-struct mmap_unlock_irq_work {
+/* irq_work to run mmap_read_unlock() or mmput_async() in irq_work */
+struct bpf_iter_mm_irq_work {
struct irq_work irq_work;
struct mm_struct *mm;
};
-DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+DECLARE_PER_CPU(struct bpf_iter_mm_irq_work, mmap_unlock_work);
/*
* We cannot do mmap_read_unlock() when the irq is disabled, because of
@@ -21,9 +21,9 @@ DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
* percpu variable to do the irq_work. If the irq_work is already used
* by another lookup, we fall over.
*/
-static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+static inline bool bpf_mmap_unlock_get_irq_work(struct bpf_iter_mm_irq_work **work_ptr)
{
- struct mmap_unlock_irq_work *work = NULL;
+ struct bpf_iter_mm_irq_work *work = NULL;
bool irq_work_busy = false;
if (irqs_disabled()) {
@@ -46,7 +46,7 @@ static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **wo
return irq_work_busy;
}
-static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+static inline void bpf_mmap_unlock_mm(struct bpf_iter_mm_irq_work *work, struct mm_struct *mm)
{
if (!work) {
mmap_read_unlock(mm);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index da3d328f5c15..7ef1042a3448 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -166,7 +166,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u32 trace_nr, bool user, bool may_fault)
{
int i;
- struct mmap_unlock_irq_work *work = NULL;
+ struct bpf_iter_mm_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
struct vm_area_struct *vma, *prev_vma = NULL;
const char *prev_build_id;
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 98d9b4c0daff..7c302ee78f7e 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -751,7 +751,7 @@ static struct bpf_iter_reg task_vma_reg_info = {
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
- struct mmap_unlock_irq_work *work = NULL;
+ struct bpf_iter_mm_irq_work *work = NULL;
struct vm_area_struct *vma;
bool irq_work_busy = false;
struct mm_struct *mm;
@@ -797,7 +797,7 @@ const struct bpf_func_proto bpf_find_vma_proto = {
struct bpf_iter_task_vma_kern_data {
struct task_struct *task;
struct mm_struct *mm;
- struct mmap_unlock_irq_work *work;
+ struct bpf_iter_mm_irq_work *work;
struct vma_iterator vmi;
};
@@ -1029,22 +1029,22 @@ __bpf_kfunc void bpf_iter_task_destroy(struct bpf_iter_task *it)
__bpf_kfunc_end_defs();
-DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+DEFINE_PER_CPU(struct bpf_iter_mm_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
{
- struct mmap_unlock_irq_work *work;
+ struct bpf_iter_mm_irq_work *work;
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return;
- work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ work = container_of(entry, struct bpf_iter_mm_irq_work, irq_work);
mmap_read_unlock_non_owner(work->mm);
}
static int __init task_iter_init(void)
{
- struct mmap_unlock_irq_work *work;
+ struct bpf_iter_mm_irq_work *work;
int ret, cpu;
for_each_possible_cpu(cpu) {
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work
2026-03-09 15:54 ` [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work Puranjay Mohan
@ 2026-03-11 18:32 ` Andrii Nakryiko
0 siblings, 0 replies; 17+ messages in thread
From: Andrii Nakryiko @ 2026-03-11 18:32 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Mykyta Yatsenko, kernel-team
On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> The next commit will be reusing mmap_unlock_irq_work to do mmput_async()
> in irq_work. Rather than creating a new structure, rename
> mmap_unlock_irq_work to bpf_iter_mm_irq_work and reuse it.
>
> This is only a rename with no functional changes.
>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> kernel/bpf/mmap_unlock_work.h | 12 ++++++------
> kernel/bpf/stackmap.c | 2 +-
> kernel/bpf/task_iter.c | 12 ++++++------
> 3 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
> index 5d18d7d85bef..ba8e860f1180 100644
> --- a/kernel/bpf/mmap_unlock_work.h
> +++ b/kernel/bpf/mmap_unlock_work.h
> @@ -6,13 +6,13 @@
> #define __MMAP_UNLOCK_WORK_H__
> #include <linux/irq_work.h>
>
> -/* irq_work to run mmap_read_unlock() in irq_work */
> -struct mmap_unlock_irq_work {
> +/* irq_work to run mmap_read_unlock() or mmput_async() in irq_work */
> +struct bpf_iter_mm_irq_work {
given this is used from bpf_find_vma() and
stack_map_get_build_id_offset(), why does it make sense to call this
"bpf_iter_"-something?
and while for those two per-CPU is a necessary evil/compromise, for
vma iterator, why can't we just have irq_work as part of
bpf_iter_task_vma_kern_data? it's dynamically allocated, so no problem
increasing its size. We won't have to have unpredictable same CPU
random "busyness problems"...
> struct irq_work irq_work;
> struct mm_struct *mm;
> };
>
[...]
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
2026-03-09 15:54 [PATCH bpf v2 0/4] bpf: fix and improve open-coded task_vma iterator Puranjay Mohan
2026-03-09 15:54 ` [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work Puranjay Mohan
@ 2026-03-09 15:54 ` Puranjay Mohan
2026-03-09 16:48 ` Alexei Starovoitov
2026-03-11 18:35 ` Andrii Nakryiko
2026-03-09 15:54 ` [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks Puranjay Mohan
2026-03-09 15:54 ` [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator Puranjay Mohan
3 siblings, 2 replies; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-09 15:54 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, Mykyta Yatsenko,
kernel-team
The open-coded task_vma iterator reads task->mm and acquires
mmap_read_trylock() but never calls mmget(). The mm can reach
mm_users == 0 if the task exits while the iterator holds the lock.
Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
on error paths via mmput_async(), which is safe from process context.
From hardirq/NMI context mmput_async() can re-enter __queue_work() and
deadlock on pool->lock, so defer to irq_work in that case, following
the pattern from defer_timer_wq_op() in kernel/bpf/helpers.c.
Widen the mmput_async() #if guard to include CONFIG_BPF_SYSCALL,
following the same approach used for CONFIG_FUTEX_PRIVATE_HASH.
Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
include/linux/sched/mm.h | 2 +-
kernel/bpf/task_iter.c | 65 +++++++++++++++++++++++++++++++++++++---
kernel/fork.c | 2 +-
3 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 95d0040df584..5908de0c2f82 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
-#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL)
/* same as above but performs the slow path from the async context. Can
* be called from the atomic context as well
*/
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 7c302ee78f7e..e8efc9e1f602 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -9,6 +9,7 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
+#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"
static const char * const iter_task_type_names[] = {
@@ -813,6 +814,44 @@ struct bpf_iter_task_vma_kern {
struct bpf_iter_task_vma_kern_data *data;
} __attribute__((aligned(8)));
+/*
+ * mmput_async() is unsafe when IRQs are disabled because it can re-enter
+ * __queue_work() and deadlock on pool->lock. Defer to irq_work in that case,
+ * same pattern as defer_timer_wq_op() in kernel/bpf/helpers.c.
+ */
+static DEFINE_PER_CPU(struct bpf_iter_mm_irq_work, bpf_iter_mmput_work);
+
+static void do_bpf_iter_mmput(struct irq_work *entry)
+{
+ struct bpf_iter_mm_irq_work *work;
+
+ work = container_of(entry, struct bpf_iter_mm_irq_work, irq_work);
+ if (work->mm) {
+ mmput_async(work->mm);
+ work->mm = NULL;
+ }
+}
+
+static void bpf_iter_mmput(struct mm_struct *mm)
+{
+ if (!in_hardirq() && !irqs_disabled()) {
+ mmput_async(mm);
+ } else {
+ struct bpf_iter_mm_irq_work *work;
+
+ work = this_cpu_ptr(&bpf_iter_mmput_work);
+ work->mm = mm;
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+static bool bpf_iter_mmput_busy(void)
+{
+ if (!in_hardirq() && !irqs_disabled())
+ return false;
+ return irq_work_is_busy(&this_cpu_ptr(&bpf_iter_mmput_work)->irq_work);
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
@@ -840,19 +879,33 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
goto err_cleanup_iter;
}
- /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+ /*
+ * Both mmap_lock and mmput irq_work slots must be free for _destroy().
+ * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
+ */
irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
- if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+ if (irq_work_busy || bpf_iter_mmput_busy()) {
err = -EBUSY;
goto err_cleanup_iter;
}
+ if (!mmget_not_zero(kit->data->mm)) {
+ err = -ENOENT;
+ goto err_cleanup_iter;
+ }
+
+ if (!mmap_read_trylock(kit->data->mm)) {
+ err = -EBUSY;
+ goto err_cleanup_mmget;
+ }
+
vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
return 0;
+err_cleanup_mmget:
+ bpf_iter_mmput(kit->data->mm);
err_cleanup_iter:
- if (kit->data->task)
- put_task_struct(kit->data->task);
+ put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
/* NULL kit->data signals failed bpf_iter_task_vma initialization */
kit->data = NULL;
@@ -874,6 +927,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
if (kit->data) {
bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ bpf_iter_mmput(kit->data->mm);
put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
}
@@ -1044,12 +1098,15 @@ static void do_mmap_read_unlock(struct irq_work *entry)
static int __init task_iter_init(void)
{
+ struct bpf_iter_mm_irq_work *mmput_work;
struct bpf_iter_mm_irq_work *work;
int ret, cpu;
for_each_possible_cpu(cpu) {
work = per_cpu_ptr(&mmap_unlock_work, cpu);
init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ mmput_work = per_cpu_ptr(&bpf_iter_mmput_work, cpu);
+ init_irq_work(&mmput_work->irq_work, do_bpf_iter_mmput);
}
task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..d0411a63d4ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1198,7 +1198,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
2026-03-09 15:54 ` [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator Puranjay Mohan
@ 2026-03-09 16:48 ` Alexei Starovoitov
2026-03-09 18:02 ` Puranjay Mohan
2026-03-11 18:35 ` Andrii Nakryiko
1 sibling, 1 reply; 17+ messages in thread
From: Alexei Starovoitov @ 2026-03-09 16:48 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Mykyta Yatsenko, Kernel Team
On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> The open-coded task_vma iterator reads task->mm and acquires
> mmap_read_trylock() but never calls mmget(). The mm can reach
> mm_users == 0 if the task exits while the iterator holds the lock.
>
> Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
> on error paths via mmput_async(), which is safe from process context.
> From hardirq/NMI context mmput_async() can re-enter __queue_work() and
hardirq can reenter __queue_work() ?
Did _you_ think about it or AI told you?
> + if (!in_hardirq() && !irqs_disabled()) {
> + mmput_async(mm);
Don't respin without answering the above.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
2026-03-09 16:48 ` Alexei Starovoitov
@ 2026-03-09 18:02 ` Puranjay Mohan
2026-03-09 18:12 ` Alexei Starovoitov
0 siblings, 1 reply; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-09 18:02 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Mykyta Yatsenko, Kernel Team
On Mon, Mar 9, 2026 at 4:48 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > The open-coded task_vma iterator reads task->mm and acquires
> > mmap_read_trylock() but never calls mmget(). The mm can reach
> > mm_users == 0 if the task exits while the iterator holds the lock.
> >
> > Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
> > on error paths via mmput_async(), which is safe from process context.
> > From hardirq/NMI context mmput_async() can re-enter __queue_work() and
>
> hardirq can reenter __queue_work() ?
No.
I blindly copied from defer_timer_wq_op(), sorry for that, I should
have thought more and looked at the lockdep_assert_irqs_disabled() in
__queue_work() which means hardirq can't re enter.
We only need to check !irqs_disabled() here as reentry can happen only
from bpf programs attached to tracepoints in the call chain of
__queue_work().
> Did _you_ think about it or AI told you?
>
> > + if (!in_hardirq() && !irqs_disabled()) {
> > + mmput_async(mm);
>
> Don't respin without answering the above.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
2026-03-09 18:02 ` Puranjay Mohan
@ 2026-03-09 18:12 ` Alexei Starovoitov
0 siblings, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2026-03-09 18:12 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Mykyta Yatsenko, Kernel Team
On Mon, Mar 9, 2026 at 11:02 AM Puranjay Mohan <puranjay12@gmail.com> wrote:
>
> On Mon, Mar 9, 2026 at 4:48 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> > >
> > > The open-coded task_vma iterator reads task->mm and acquires
> > > mmap_read_trylock() but never calls mmget(). The mm can reach
> > > mm_users == 0 if the task exits while the iterator holds the lock.
> > >
> > > Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
> > > on error paths via mmput_async(), which is safe from process context.
> > > From hardirq/NMI context mmput_async() can re-enter __queue_work() and
> >
> > hardirq can reenter __queue_work() ?
>
> No.
>
> I blindly copied from defer_timer_wq_op(), sorry for that, I should
> have thought more and looked at the lockdep_assert_irqs_disabled() in
> __queue_work() which means hardirq can't re enter.
> We only need to check !irqs_disabled() here as reentry can happen only
> from bpf programs attached to tracepoints in the call chain of
> __queue_work().
Exactly.
Please add a comment.
It will make agent's analysis easier too.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
2026-03-09 15:54 ` [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator Puranjay Mohan
2026-03-09 16:48 ` Alexei Starovoitov
@ 2026-03-11 18:35 ` Andrii Nakryiko
1 sibling, 0 replies; 17+ messages in thread
From: Andrii Nakryiko @ 2026-03-11 18:35 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Mykyta Yatsenko, kernel-team
On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> The open-coded task_vma iterator reads task->mm and acquires
> mmap_read_trylock() but never calls mmget(). The mm can reach
> mm_users == 0 if the task exits while the iterator holds the lock.
>
> Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
> on error paths via mmput_async(), which is safe from process context.
> From hardirq/NMI context mmput_async() can re-enter __queue_work() and
> deadlock on pool->lock, so defer to irq_work in that case, following
> the pattern from defer_timer_wq_op() in kernel/bpf/helpers.c.
>
> Widen the mmput_async() #if guard to include CONFIG_BPF_SYSCALL,
> following the same approach used for CONFIG_FUTEX_PRIVATE_HASH.
>
> Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> include/linux/sched/mm.h | 2 +-
> kernel/bpf/task_iter.c | 65 +++++++++++++++++++++++++++++++++++++---
> kernel/fork.c | 2 +-
> 3 files changed, 63 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 95d0040df584..5908de0c2f82 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
>
> /* mmput gets rid of the mappings and all user-space */
> extern void mmput(struct mm_struct *);
> -#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
> +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL)
> /* same as above but performs the slow path from the async context. Can
> * be called from the atomic context as well
> */
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index 7c302ee78f7e..e8efc9e1f602 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -9,6 +9,7 @@
> #include <linux/bpf_mem_alloc.h>
> #include <linux/btf_ids.h>
> #include <linux/mm_types.h>
> +#include <linux/sched/mm.h>
> #include "mmap_unlock_work.h"
>
> static const char * const iter_task_type_names[] = {
> @@ -813,6 +814,44 @@ struct bpf_iter_task_vma_kern {
> struct bpf_iter_task_vma_kern_data *data;
> } __attribute__((aligned(8)));
>
> +/*
> + * mmput_async() is unsafe when IRQs are disabled because it can re-enter
> + * __queue_work() and deadlock on pool->lock. Defer to irq_work in that case,
> + * same pattern as defer_timer_wq_op() in kernel/bpf/helpers.c.
> + */
> +static DEFINE_PER_CPU(struct bpf_iter_mm_irq_work, bpf_iter_mmput_work);
> +
> +static void do_bpf_iter_mmput(struct irq_work *entry)
> +{
> + struct bpf_iter_mm_irq_work *work;
> +
> + work = container_of(entry, struct bpf_iter_mm_irq_work, irq_work);
> + if (work->mm) {
> + mmput_async(work->mm);
> + work->mm = NULL;
> + }
> +}
> +
> +static void bpf_iter_mmput(struct mm_struct *mm)
> +{
> + if (!in_hardirq() && !irqs_disabled()) {
> + mmput_async(mm);
> + } else {
> + struct bpf_iter_mm_irq_work *work;
> +
> + work = this_cpu_ptr(&bpf_iter_mmput_work);
> + work->mm = mm;
> + irq_work_queue(&work->irq_work);
> + }
> +}
> +
> +static bool bpf_iter_mmput_busy(void)
> +{
> + if (!in_hardirq() && !irqs_disabled())
> + return false;
> + return irq_work_is_busy(&this_cpu_ptr(&bpf_iter_mmput_work)->irq_work);
> +}
> +
> __bpf_kfunc_start_defs();
>
> __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> @@ -840,19 +879,33 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> goto err_cleanup_iter;
> }
>
> - /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
> + /*
> + * Both mmap_lock and mmput irq_work slots must be free for _destroy().
> + * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
> + */
> irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
> + if (irq_work_busy || bpf_iter_mmput_busy()) {
see my reply on previous patch, but why do we *have* to do the per-CPU
stuff here? iterator has its own state, we can have our own irq_work
node there. Simple and easy, what's the downside?
> err = -EBUSY;
> goto err_cleanup_iter;
> }
>
[...]
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks
2026-03-09 15:54 [PATCH bpf v2 0/4] bpf: fix and improve open-coded task_vma iterator Puranjay Mohan
2026-03-09 15:54 ` [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work Puranjay Mohan
2026-03-09 15:54 ` [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator Puranjay Mohan
@ 2026-03-09 15:54 ` Puranjay Mohan
2026-03-09 16:33 ` bot+bpf-ci
2026-03-11 19:00 ` Andrii Nakryiko
2026-03-09 15:54 ` [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator Puranjay Mohan
3 siblings, 2 replies; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-09 15:54 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, Mykyta Yatsenko,
kernel-team, Mykyta Yatsenko
The open-coded task_vma iterator holds mmap_lock for the entire duration
of iteration, increasing contention on this highly contended lock.
Switch to per-VMA locking. Find the next VMA via an RCU-protected maple
tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not
used because its fallback takes mmap_read_lock(), and the iterator must
work in non-sleepable contexts.
Between the RCU walk and the lock, the VMA may be removed, shrunk, or
write-locked. On failure, advance past it using vm_end from the RCU
walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be
stale; fall back to PAGE_SIZE advancement when it does not make forward
progress. VMAs inserted in gaps between iterations are not detected.
CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it.
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Mykyta Yatsenko <yatsenko@meta.com>
---
kernel/bpf/task_iter.c | 91 +++++++++++++++++++++++++++++++++---------
1 file changed, 72 insertions(+), 19 deletions(-)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index e8efc9e1f602..e20c85e06afa 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -9,6 +9,7 @@
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
#include <linux/mm_types.h>
+#include <linux/mmap_lock.h>
#include <linux/sched/mm.h>
#include "mmap_unlock_work.h"
@@ -798,8 +799,8 @@ const struct bpf_func_proto bpf_find_vma_proto = {
struct bpf_iter_task_vma_kern_data {
struct task_struct *task;
struct mm_struct *mm;
- struct bpf_iter_mm_irq_work *work;
- struct vma_iterator vmi;
+ struct vm_area_struct *locked_vma;
+ u64 last_addr;
};
struct bpf_iter_task_vma {
@@ -858,12 +859,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
struct task_struct *task, u64 addr)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
- bool irq_work_busy = false;
int err;
BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
+ if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
+ kit->data = NULL;
+ return -EOPNOTSUPP;
+ }
+
/* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
* before, so non-NULL kit->data doesn't point to previously
* bpf_mem_alloc'd bpf_iter_task_vma_kern_data
@@ -879,12 +884,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
goto err_cleanup_iter;
}
- /*
- * Both mmap_lock and mmput irq_work slots must be free for _destroy().
- * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
- */
- irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
- if (irq_work_busy || bpf_iter_mmput_busy()) {
+ /* Ensure the mmput irq_work slot is free for _destroy(). */
+ if (bpf_iter_mmput_busy()) {
err = -EBUSY;
goto err_cleanup_iter;
}
@@ -894,16 +895,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
goto err_cleanup_iter;
}
- if (!mmap_read_trylock(kit->data->mm)) {
- err = -EBUSY;
- goto err_cleanup_mmget;
- }
-
- vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
+ kit->data->locked_vma = NULL;
+ kit->data->last_addr = addr;
return 0;
-err_cleanup_mmget:
- bpf_iter_mmput(kit->data->mm);
err_cleanup_iter:
put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
@@ -912,13 +907,70 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
return err;
}
+/*
+ * Find and lock the next VMA at or after data->last_addr using an
+ * RCU-protected maple tree walk followed by lock_vma_under_rcu().
+ * On failure or if the VMA changed, advance past it using vm_end from
+ * the RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall
+ * back to PAGE_SIZE advancement to guarantee forward progress.
+ */
+static struct vm_area_struct *
+bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
+{
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+ unsigned long next_addr, next_end;
+
+retry:
+ rcu_read_lock();
+ vma_iter_init(&vmi, data->mm, data->last_addr);
+ vma = vma_next(&vmi);
+ if (!vma) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ next_addr = vma->vm_start;
+ next_end = vma->vm_end;
+ rcu_read_unlock();
+
+ vma = lock_vma_under_rcu(data->mm, next_addr);
+ if (!vma) {
+ if (next_end > data->last_addr)
+ data->last_addr = next_end;
+ else
+ data->last_addr += PAGE_SIZE;
+ goto retry;
+ }
+
+ if (unlikely(data->last_addr >= vma->vm_end)) {
+ data->last_addr = vma->vm_end;
+ vma_end_read(vma);
+ goto retry;
+ }
+
+ return vma;
+}
+
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
+ struct vm_area_struct *vma;
if (!kit->data) /* bpf_iter_task_vma_new failed */
return NULL;
- return vma_next(&kit->data->vmi);
+
+ if (kit->data->locked_vma)
+ vma_end_read(kit->data->locked_vma);
+
+ vma = bpf_iter_task_vma_find_next(kit->data);
+ if (!vma) {
+ kit->data->locked_vma = NULL;
+ return NULL;
+ }
+
+ kit->data->locked_vma = vma;
+ kit->data->last_addr = vma->vm_end;
+ return vma;
}
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
@@ -926,7 +978,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
struct bpf_iter_task_vma_kern *kit = (void *)it;
if (kit->data) {
- bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+ if (kit->data->locked_vma)
+ vma_end_read(kit->data->locked_vma);
bpf_iter_mmput(kit->data->mm);
put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks
2026-03-09 15:54 ` [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks Puranjay Mohan
@ 2026-03-09 16:33 ` bot+bpf-ci
2026-03-11 19:00 ` Andrii Nakryiko
1 sibling, 0 replies; 17+ messages in thread
From: bot+bpf-ci @ 2026-03-09 16:33 UTC (permalink / raw)
To: puranjay, bpf
Cc: puranjay, puranjay12, ast, andrii, daniel, martin.lau, eddyz87,
memxor, mykyta.yatsenko5, kernel-team, yatsenko, ast, andrii,
daniel, martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai
[-- Attachment #1: Type: text/plain, Size: 2512 bytes --]
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
[ ... ]
> +static struct vm_area_struct *
> +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> +{
> + struct vm_area_struct *vma;
> + struct vma_iterator vmi;
> + unsigned long next_addr, next_end;
> +
> +retry:
> + rcu_read_lock();
> + vma_iter_init(&vmi, data->mm, data->last_addr);
> + vma = vma_next(&vmi);
> + if (!vma) {
> + rcu_read_unlock();
> + return NULL;
> + }
> + next_addr = vma->vm_start;
> + next_end = vma->vm_end;
> + rcu_read_unlock();
> +
> + vma = lock_vma_under_rcu(data->mm, next_addr);
> + if (!vma) {
> + if (next_end > data->last_addr)
> + data->last_addr = next_end;
> + else
> + data->last_addr += PAGE_SIZE;
> + goto retry;
> + }
> +
> + if (unlikely(data->last_addr >= vma->vm_end)) {
> + data->last_addr = vma->vm_end;
> + vma_end_read(vma);
> + goto retry;
> + }
When lock_vma_under_rcu() fails above, the code correctly ensures
forward progress: it uses next_end when that advances past
last_addr, otherwise falls back to PAGE_SIZE.
However in this second branch, when lock_vma_under_rcu() succeeds
but last_addr >= vma->vm_end, setting last_addr = vma->vm_end can
move the iterator backward. Since the condition guarantees
vma->vm_end <= last_addr, the assignment either keeps last_addr
the same or reduces it.
This can happen when a VMA is concurrently shrunk between the RCU
walk and the lock_vma_under_rcu() call:
last_addr = 0x5000 (from prior iteration)
RCU walk finds VMA [0x3000, 0x8000), next_addr = 0x3000
VMA concurrently shrunk to [0x3000, 0x4000)
lock_vma_under_rcu(mm, 0x3000) locks [0x3000, 0x4000)
last_addr (0x5000) >= vm_end (0x4000)
last_addr = 0x4000 -- moved backward
The backward movement causes the iterator to re-visit VMAs that
were already returned to the BPF program.
Could this use the same PAGE_SIZE advancement that the
lock_vma_under_rcu() failure path uses? For example:
if (unlikely(data->last_addr >= vma->vm_end)) {
data->last_addr += PAGE_SIZE;
vma_end_read(vma);
goto retry;
}
This would maintain the forward progress guarantee described in
the commit message.
> +
> + return vma;
> +}
[ ... ]
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/22862450537
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks
2026-03-09 15:54 ` [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks Puranjay Mohan
2026-03-09 16:33 ` bot+bpf-ci
@ 2026-03-11 19:00 ` Andrii Nakryiko
2026-03-11 19:25 ` Puranjay Mohan
1 sibling, 1 reply; 17+ messages in thread
From: Andrii Nakryiko @ 2026-03-11 19:00 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Mykyta Yatsenko, kernel-team,
Mykyta Yatsenko
On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> The open-coded task_vma iterator holds mmap_lock for the entire duration
> of iteration, increasing contention on this highly contended lock.
>
> Switch to per-VMA locking. Find the next VMA via an RCU-protected maple
> tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not
> used because its fallback takes mmap_read_lock(), and the iterator must
> work in non-sleepable contexts.
>
> Between the RCU walk and the lock, the VMA may be removed, shrunk, or
> write-locked. On failure, advance past it using vm_end from the RCU
> walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be
> stale; fall back to PAGE_SIZE advancement when it does not make forward
> progress. VMAs inserted in gaps between iterations are not detected.
>
> CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it.
>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> Acked-by: Mykyta Yatsenko <yatsenko@meta.com>
> ---
> kernel/bpf/task_iter.c | 91 +++++++++++++++++++++++++++++++++---------
> 1 file changed, 72 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index e8efc9e1f602..e20c85e06afa 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -9,6 +9,7 @@
> #include <linux/bpf_mem_alloc.h>
> #include <linux/btf_ids.h>
> #include <linux/mm_types.h>
> +#include <linux/mmap_lock.h>
> #include <linux/sched/mm.h>
> #include "mmap_unlock_work.h"
>
> @@ -798,8 +799,8 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> struct bpf_iter_task_vma_kern_data {
> struct task_struct *task;
> struct mm_struct *mm;
> - struct bpf_iter_mm_irq_work *work;
> - struct vma_iterator vmi;
> + struct vm_area_struct *locked_vma;
> + u64 last_addr;
> };
>
> struct bpf_iter_task_vma {
> @@ -858,12 +859,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> struct task_struct *task, u64 addr)
> {
> struct bpf_iter_task_vma_kern *kit = (void *)it;
> - bool irq_work_busy = false;
> int err;
>
> BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
>
> + if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
> + kit->data = NULL;
> + return -EOPNOTSUPP;
> + }
> +
> /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> * before, so non-NULL kit->data doesn't point to previously
> * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> @@ -879,12 +884,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> goto err_cleanup_iter;
> }
>
> - /*
> - * Both mmap_lock and mmput irq_work slots must be free for _destroy().
> - * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
> - */
> - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> - if (irq_work_busy || bpf_iter_mmput_busy()) {
> + /* Ensure the mmput irq_work slot is free for _destroy(). */
> + if (bpf_iter_mmput_busy()) {
> err = -EBUSY;
> goto err_cleanup_iter;
> }
> @@ -894,16 +895,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> goto err_cleanup_iter;
> }
>
> - if (!mmap_read_trylock(kit->data->mm)) {
> - err = -EBUSY;
> - goto err_cleanup_mmget;
> - }
> -
> - vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
> + kit->data->locked_vma = NULL;
> + kit->data->last_addr = addr;
> return 0;
>
> -err_cleanup_mmget:
> - bpf_iter_mmput(kit->data->mm);
> err_cleanup_iter:
> put_task_struct(kit->data->task);
> bpf_mem_free(&bpf_global_ma, kit->data);
> @@ -912,13 +907,70 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> return err;
> }
>
> +/*
> + * Find and lock the next VMA at or after data->last_addr using an
> + * RCU-protected maple tree walk followed by lock_vma_under_rcu().
> + * On failure or if the VMA changed, advance past it using vm_end from
> + * the RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall
> + * back to PAGE_SIZE advancement to guarantee forward progress.
> + */
> +static struct vm_area_struct *
> +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> +{
> + struct vm_area_struct *vma;
> + struct vma_iterator vmi;
> + unsigned long next_addr, next_end;
> +
> +retry:
> + rcu_read_lock();
> + vma_iter_init(&vmi, data->mm, data->last_addr);
> + vma = vma_next(&vmi);
> + if (!vma) {
> + rcu_read_unlock();
> + return NULL;
> + }
> + next_addr = vma->vm_start;
> + next_end = vma->vm_end;
> + rcu_read_unlock();
are we doing this whole vma_iter_init+vma_next just to differentiate
between lock_vma_under_rcu() return NULL because a) there is no VMA
anymore and b) VMA is/was there, but we failed to lock it? Is that
right? If so, let's add some explicit comment explaining that aspect
specifically?
better still would be to change lock_vma_under_rcu to return
ERR_PTR(), maybe mm folks won't be against that?..
> +
> + vma = lock_vma_under_rcu(data->mm, next_addr);
> + if (!vma) {
> + if (next_end > data->last_addr)
> + data->last_addr = next_end;
> + else
> + data->last_addr += PAGE_SIZE;
this feels very arbitrary and kind of wrong. if we are unlucky to look
up vma that was moved to earlier, next retry won't look it up anymore,
and we'll get better vm_end. So instead of arbitrary + PAGE_SIZE,
let's just keep last_addr as is and retry?
and speaking of, we should still pass data->last_addr into
lock_vma_under_rcu(), IMO, given our vma_next() is speculative, no?
> + goto retry;
> + }
> +
> + if (unlikely(data->last_addr >= vma->vm_end)) {
> + data->last_addr = vma->vm_end;
> + vma_end_read(vma);
> + goto retry;
> + }
> +
> + return vma;
> +}
> +
> __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> {
> struct bpf_iter_task_vma_kern *kit = (void *)it;
> + struct vm_area_struct *vma;
>
> if (!kit->data) /* bpf_iter_task_vma_new failed */
> return NULL;
> - return vma_next(&kit->data->vmi);
> +
> + if (kit->data->locked_vma)
> + vma_end_read(kit->data->locked_vma);
clear kit->data->locked_vma here
> +
> + vma = bpf_iter_task_vma_find_next(kit->data);
> + if (!vma) {
> + kit->data->locked_vma = NULL;
instead of here? what are we microoptimizing for?
> + return NULL;
> + }
> +
> + kit->data->locked_vma = vma;
> + kit->data->last_addr = vma->vm_end;
nit: this is "next addr", not "last addr"
> + return vma;
> }
>
> __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> @@ -926,7 +978,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> struct bpf_iter_task_vma_kern *kit = (void *)it;
>
> if (kit->data) {
> - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
> + if (kit->data->locked_vma)
> + vma_end_read(kit->data->locked_vma);
> bpf_iter_mmput(kit->data->mm);
> put_task_struct(kit->data->task);
> bpf_mem_free(&bpf_global_ma, kit->data);
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks
2026-03-11 19:00 ` Andrii Nakryiko
@ 2026-03-11 19:25 ` Puranjay Mohan
2026-03-11 23:54 ` Andrii Nakryiko
0 siblings, 1 reply; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-11 19:25 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Mykyta Yatsenko, kernel-team, Mykyta Yatsenko
On Wed, Mar 11, 2026 at 7:00 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > The open-coded task_vma iterator holds mmap_lock for the entire duration
> > of iteration, increasing contention on this highly contended lock.
> >
> > Switch to per-VMA locking. Find the next VMA via an RCU-protected maple
> > tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not
> > used because its fallback takes mmap_read_lock(), and the iterator must
> > work in non-sleepable contexts.
> >
> > Between the RCU walk and the lock, the VMA may be removed, shrunk, or
> > write-locked. On failure, advance past it using vm_end from the RCU
> > walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be
> > stale; fall back to PAGE_SIZE advancement when it does not make forward
> > progress. VMAs inserted in gaps between iterations are not detected.
> >
> > CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it.
> >
> > Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> > Acked-by: Mykyta Yatsenko <yatsenko@meta.com>
> > ---
> > kernel/bpf/task_iter.c | 91 +++++++++++++++++++++++++++++++++---------
> > 1 file changed, 72 insertions(+), 19 deletions(-)
> >
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index e8efc9e1f602..e20c85e06afa 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -9,6 +9,7 @@
> > #include <linux/bpf_mem_alloc.h>
> > #include <linux/btf_ids.h>
> > #include <linux/mm_types.h>
> > +#include <linux/mmap_lock.h>
> > #include <linux/sched/mm.h>
> > #include "mmap_unlock_work.h"
> >
> > @@ -798,8 +799,8 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> > struct bpf_iter_task_vma_kern_data {
> > struct task_struct *task;
> > struct mm_struct *mm;
> > - struct bpf_iter_mm_irq_work *work;
> > - struct vma_iterator vmi;
> > + struct vm_area_struct *locked_vma;
> > + u64 last_addr;
> > };
> >
> > struct bpf_iter_task_vma {
> > @@ -858,12 +859,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > struct task_struct *task, u64 addr)
> > {
> > struct bpf_iter_task_vma_kern *kit = (void *)it;
> > - bool irq_work_busy = false;
> > int err;
> >
> > BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> > BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> >
> > + if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
> > + kit->data = NULL;
> > + return -EOPNOTSUPP;
> > + }
> > +
> > /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> > * before, so non-NULL kit->data doesn't point to previously
> > * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> > @@ -879,12 +884,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > goto err_cleanup_iter;
> > }
> >
> > - /*
> > - * Both mmap_lock and mmput irq_work slots must be free for _destroy().
> > - * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
> > - */
> > - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> > - if (irq_work_busy || bpf_iter_mmput_busy()) {
> > + /* Ensure the mmput irq_work slot is free for _destroy(). */
> > + if (bpf_iter_mmput_busy()) {
> > err = -EBUSY;
> > goto err_cleanup_iter;
> > }
> > @@ -894,16 +895,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > goto err_cleanup_iter;
> > }
> >
> > - if (!mmap_read_trylock(kit->data->mm)) {
> > - err = -EBUSY;
> > - goto err_cleanup_mmget;
> > - }
> > -
> > - vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
> > + kit->data->locked_vma = NULL;
> > + kit->data->last_addr = addr;
> > return 0;
> >
> > -err_cleanup_mmget:
> > - bpf_iter_mmput(kit->data->mm);
> > err_cleanup_iter:
> > put_task_struct(kit->data->task);
> > bpf_mem_free(&bpf_global_ma, kit->data);
> > @@ -912,13 +907,70 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > return err;
> > }
> >
> > +/*
> > + * Find and lock the next VMA at or after data->last_addr using an
> > + * RCU-protected maple tree walk followed by lock_vma_under_rcu().
> > + * On failure or if the VMA changed, advance past it using vm_end from
> > + * the RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall
> > + * back to PAGE_SIZE advancement to guarantee forward progress.
> > + */
> > +static struct vm_area_struct *
> > +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> > +{
> > + struct vm_area_struct *vma;
> > + struct vma_iterator vmi;
> > + unsigned long next_addr, next_end;
> > +
> > +retry:
> > + rcu_read_lock();
> > + vma_iter_init(&vmi, data->mm, data->last_addr);
> > + vma = vma_next(&vmi);
> > + if (!vma) {
> > + rcu_read_unlock();
> > + return NULL;
> > + }
> > + next_addr = vma->vm_start;
> > + next_end = vma->vm_end;
> > + rcu_read_unlock();
>
> are we doing this whole vma_iter_init+vma_next just to differentiate
> between lock_vma_under_rcu() return NULL because a) there is no VMA
> anymore and b) VMA is/was there, but we failed to lock it? Is that
> right? If so, let's add some explicit comment explaining that aspect
> specifically?
>
> better still would be to change lock_vma_under_rcu to return
> ERR_PTR(), maybe mm folks won't be against that?..
lock_vma_under_rcu(mm, addr) uses mas_walk() internally which only
finds a VMA that contains addr (i.e., vm_start <= addr < vm_end). If
addr falls in a gap between VMAs, it returns NULL. It cannot iterate
forward to find the next VMA. The RCU walk with vma_next() (which uses
mas_find()) finds the first VMA at or after last_addr, bridging gaps.
This is the reason we have to split it into two parts. We could have
used lock_next_vma() but it falls back to mmap_lock. We could create
new helpers in the mm/ that don't fallback but return NULL, but I
wanted the changes to stay within BPF code.
> > +
> > + vma = lock_vma_under_rcu(data->mm, next_addr);
> > + if (!vma) {
> > + if (next_end > data->last_addr)
> > + data->last_addr = next_end;
> > + else
> > + data->last_addr += PAGE_SIZE;
>
> this feels very arbitrary and kind of wrong. if we are unlucky to look
> up vma that was moved to earlier, next retry won't look it up anymore,
> and we'll get better vm_end. So instead of arbitrary + PAGE_SIZE,
> let's just keep last_addr as is and retry?
There's no guarantee the stale maple tree entry is gone on the next
retry. If the concurrent writer hasn't completed the RCU-visible tree
update, we'd see the same stale entry and spin without progress. The
PAGE_SIZE fallback provides a hard forward-progress guarantee for this
case.
> and speaking of, we should still pass data->last_addr into
> lock_vma_under_rcu(), IMO, given our vma_next() is speculative, no?
>
As explained above, the reason for splitting is not error checking; we
can't pass data->last_addr to lock_vma_under_rcu() because
data->last_addr could be in a gap.
> > + goto retry;
> > + }
> > +
> > + if (unlikely(data->last_addr >= vma->vm_end)) {
> > + data->last_addr = vma->vm_end;
> > + vma_end_read(vma);
> > + goto retry;
> > + }
> > +
> > + return vma;
> > +}
> > +
> > __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> > {
> > struct bpf_iter_task_vma_kern *kit = (void *)it;
> > + struct vm_area_struct *vma;
> >
> > if (!kit->data) /* bpf_iter_task_vma_new failed */
> > return NULL;
> > - return vma_next(&kit->data->vmi);
> > +
> > + if (kit->data->locked_vma)
> > + vma_end_read(kit->data->locked_vma);
>
> clear kit->data->locked_vma here
>
> > +
> > + vma = bpf_iter_task_vma_find_next(kit->data);
> > + if (!vma) {
> > + kit->data->locked_vma = NULL;
>
> instead of here? what are we microoptimizing for?
Will do in next version.
> > + return NULL;
> > + }
> > +
> > + kit->data->locked_vma = vma;
> > + kit->data->last_addr = vma->vm_end;
>
> nit: this is "next addr", not "last addr"
Will change.
>
> > + return vma;
> > }
> >
> > __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> > @@ -926,7 +978,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> > struct bpf_iter_task_vma_kern *kit = (void *)it;
> >
> > if (kit->data) {
> > - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
> > + if (kit->data->locked_vma)
> > + vma_end_read(kit->data->locked_vma);
> > bpf_iter_mmput(kit->data->mm);
> > put_task_struct(kit->data->task);
> > bpf_mem_free(&bpf_global_ma, kit->data);
> > --
> > 2.47.3
> >
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks
2026-03-11 19:25 ` Puranjay Mohan
@ 2026-03-11 23:54 ` Andrii Nakryiko
0 siblings, 0 replies; 17+ messages in thread
From: Andrii Nakryiko @ 2026-03-11 23:54 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Mykyta Yatsenko, kernel-team, Mykyta Yatsenko
On Wed, Mar 11, 2026 at 12:25 PM Puranjay Mohan <puranjay12@gmail.com> wrote:
>
> On Wed, Mar 11, 2026 at 7:00 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> > >
> > > The open-coded task_vma iterator holds mmap_lock for the entire duration
> > > of iteration, increasing contention on this highly contended lock.
> > >
> > > Switch to per-VMA locking. Find the next VMA via an RCU-protected maple
> > > tree walk and lock it with lock_vma_under_rcu(). lock_next_vma() is not
> > > used because its fallback takes mmap_read_lock(), and the iterator must
> > > work in non-sleepable contexts.
> > >
> > > Between the RCU walk and the lock, the VMA may be removed, shrunk, or
> > > write-locked. On failure, advance past it using vm_end from the RCU
> > > walk. Because the VMA slab is SLAB_TYPESAFE_BY_RCU, vm_end may be
> > > stale; fall back to PAGE_SIZE advancement when it does not make forward
> > > progress. VMAs inserted in gaps between iterations are not detected.
> > >
> > > CONFIG_PER_VMA_LOCK is required; return -EOPNOTSUPP without it.
> > >
> > > Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> > > Acked-by: Mykyta Yatsenko <yatsenko@meta.com>
> > > ---
> > > kernel/bpf/task_iter.c | 91 +++++++++++++++++++++++++++++++++---------
> > > 1 file changed, 72 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > > index e8efc9e1f602..e20c85e06afa 100644
> > > --- a/kernel/bpf/task_iter.c
> > > +++ b/kernel/bpf/task_iter.c
> > > @@ -9,6 +9,7 @@
> > > #include <linux/bpf_mem_alloc.h>
> > > #include <linux/btf_ids.h>
> > > #include <linux/mm_types.h>
> > > +#include <linux/mmap_lock.h>
> > > #include <linux/sched/mm.h>
> > > #include "mmap_unlock_work.h"
> > >
> > > @@ -798,8 +799,8 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> > > struct bpf_iter_task_vma_kern_data {
> > > struct task_struct *task;
> > > struct mm_struct *mm;
> > > - struct bpf_iter_mm_irq_work *work;
> > > - struct vma_iterator vmi;
> > > + struct vm_area_struct *locked_vma;
> > > + u64 last_addr;
> > > };
> > >
> > > struct bpf_iter_task_vma {
> > > @@ -858,12 +859,16 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > > struct task_struct *task, u64 addr)
> > > {
> > > struct bpf_iter_task_vma_kern *kit = (void *)it;
> > > - bool irq_work_busy = false;
> > > int err;
> > >
> > > BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma));
> > > BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma));
> > >
> > > + if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) {
> > > + kit->data = NULL;
> > > + return -EOPNOTSUPP;
> > > + }
> > > +
> > > /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized
> > > * before, so non-NULL kit->data doesn't point to previously
> > > * bpf_mem_alloc'd bpf_iter_task_vma_kern_data
> > > @@ -879,12 +884,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > > goto err_cleanup_iter;
> > > }
> > >
> > > - /*
> > > - * Both mmap_lock and mmput irq_work slots must be free for _destroy().
> > > - * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
> > > - */
> > > - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
> > > - if (irq_work_busy || bpf_iter_mmput_busy()) {
> > > + /* Ensure the mmput irq_work slot is free for _destroy(). */
> > > + if (bpf_iter_mmput_busy()) {
> > > err = -EBUSY;
> > > goto err_cleanup_iter;
> > > }
> > > @@ -894,16 +895,10 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > > goto err_cleanup_iter;
> > > }
> > >
> > > - if (!mmap_read_trylock(kit->data->mm)) {
> > > - err = -EBUSY;
> > > - goto err_cleanup_mmget;
> > > - }
> > > -
> > > - vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
> > > + kit->data->locked_vma = NULL;
> > > + kit->data->last_addr = addr;
> > > return 0;
> > >
> > > -err_cleanup_mmget:
> > > - bpf_iter_mmput(kit->data->mm);
> > > err_cleanup_iter:
> > > put_task_struct(kit->data->task);
> > > bpf_mem_free(&bpf_global_ma, kit->data);
> > > @@ -912,13 +907,70 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > > return err;
> > > }
> > >
> > > +/*
> > > + * Find and lock the next VMA at or after data->last_addr using an
> > > + * RCU-protected maple tree walk followed by lock_vma_under_rcu().
> > > + * On failure or if the VMA changed, advance past it using vm_end from
> > > + * the RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall
> > > + * back to PAGE_SIZE advancement to guarantee forward progress.
> > > + */
> > > +static struct vm_area_struct *
> > > +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> > > +{
> > > + struct vm_area_struct *vma;
> > > + struct vma_iterator vmi;
> > > + unsigned long next_addr, next_end;
> > > +
> > > +retry:
> > > + rcu_read_lock();
> > > + vma_iter_init(&vmi, data->mm, data->last_addr);
> > > + vma = vma_next(&vmi);
> > > + if (!vma) {
> > > + rcu_read_unlock();
> > > + return NULL;
> > > + }
> > > + next_addr = vma->vm_start;
> > > + next_end = vma->vm_end;
> > > + rcu_read_unlock();
> >
> > are we doing this whole vma_iter_init+vma_next just to differentiate
> > between lock_vma_under_rcu() return NULL because a) there is no VMA
> > anymore and b) VMA is/was there, but we failed to lock it? Is that
> > right? If so, let's add some explicit comment explaining that aspect
> > specifically?
> >
> > better still would be to change lock_vma_under_rcu to return
> > ERR_PTR(), maybe mm folks won't be against that?..
>
> lock_vma_under_rcu(mm, addr) uses mas_walk() internally which only
> finds a VMA that contains addr (i.e., vm_start <= addr < vm_end). If
> addr falls in a gap between VMAs, it returns NULL. It cannot iterate
> forward to find the next VMA. The RCU walk with vma_next() (which uses
> mas_find()) finds the first VMA at or after last_addr, bridging gaps.
I see, makes sense. I missed this bit.
> This is the reason we have to split it into two parts. We could have
> used lock_next_vma() but it falls back to mmap_lock. We could create
> new helpers in the mm/ that don't fallback but return NULL, but I
> wanted the changes to stay within BPF code.
>
> > > +
> > > + vma = lock_vma_under_rcu(data->mm, next_addr);
> > > + if (!vma) {
> > > + if (next_end > data->last_addr)
> > > + data->last_addr = next_end;
> > > + else
> > > + data->last_addr += PAGE_SIZE;
> >
> > this feels very arbitrary and kind of wrong. if we are unlucky to look
> > up vma that was moved to earlier, next retry won't look it up anymore,
> > and we'll get better vm_end. So instead of arbitrary + PAGE_SIZE,
> > let's just keep last_addr as is and retry?
>
> There's no guarantee the stale maple tree entry is gone on the next
> retry. If the concurrent writer hasn't completed the RCU-visible tree
> update, we'd see the same stale entry and spin without progress. The
> PAGE_SIZE fallback provides a hard forward-progress guarantee for this
> case.
hm, ok
>
> > and speaking of, we should still pass data->last_addr into
> > lock_vma_under_rcu(), IMO, given our vma_next() is speculative, no?
> >
> As explained above, the reason for splitting is not error checking; we
> can't pass data->last_addr to lock_vma_under_rcu() because
> data->last_addr could be in a gap.
>
>
[...]
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator
2026-03-09 15:54 [PATCH bpf v2 0/4] bpf: fix and improve open-coded task_vma iterator Puranjay Mohan
` (2 preceding siblings ...)
2026-03-09 15:54 ` [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks Puranjay Mohan
@ 2026-03-09 15:54 ` Puranjay Mohan
2026-03-09 17:11 ` Mykyta Yatsenko
2026-03-11 19:07 ` Andrii Nakryiko
3 siblings, 2 replies; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-09 15:54 UTC (permalink / raw)
To: bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, Mykyta Yatsenko,
kernel-team
Holding the per-VMA lock across the BPF program body creates a lock
ordering problem when helpers acquire locks that depend on mmap_lock:
vm_lock -> i_rwsem -> mmap_lock -> vm_lock
Snapshot VMA fields under the per-VMA lock in _next(), then drop the
lock before returning. The BPF program accesses only the snapshot.
Copy vm_start, vm_end, vm_flags, vm_pgoff, vm_page_prot, vm_file, and
vm_mm. vm_file is reference-counted with get_file() under the lock and
released via fput() on the next iteration or in _destroy(). vm_mm uses
the mm pointer already held via mmget().
Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
kernel/bpf/task_iter.c | 34 ++++++++++++++++++++++------------
1 file changed, 22 insertions(+), 12 deletions(-)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index e20c85e06afa..f04d6e310fd3 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -799,7 +799,7 @@ const struct bpf_func_proto bpf_find_vma_proto = {
struct bpf_iter_task_vma_kern_data {
struct task_struct *task;
struct mm_struct *mm;
- struct vm_area_struct *locked_vma;
+ struct vm_area_struct snapshot;
u64 last_addr;
};
@@ -895,8 +895,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
goto err_cleanup_iter;
}
- kit->data->locked_vma = NULL;
kit->data->last_addr = addr;
+ memset(&kit->data->snapshot, 0, sizeof(kit->data->snapshot));
return 0;
err_cleanup_iter:
@@ -954,23 +954,33 @@ bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
__bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
{
struct bpf_iter_task_vma_kern *kit = (void *)it;
- struct vm_area_struct *vma;
+ struct vm_area_struct *snap, *vma;
if (!kit->data) /* bpf_iter_task_vma_new failed */
return NULL;
- if (kit->data->locked_vma)
- vma_end_read(kit->data->locked_vma);
+ snap = &kit->data->snapshot;
+
+ if (snap->vm_file) {
+ fput(snap->vm_file);
+ snap->vm_file = NULL;
+ }
vma = bpf_iter_task_vma_find_next(kit->data);
- if (!vma) {
- kit->data->locked_vma = NULL;
+ if (!vma)
return NULL;
- }
- kit->data->locked_vma = vma;
+ snap->vm_start = vma->vm_start;
+ snap->vm_end = vma->vm_end;
+ snap->vm_mm = kit->data->mm;
+ snap->vm_page_prot = vma->vm_page_prot;
+ snap->flags = vma->flags;
+ snap->vm_pgoff = vma->vm_pgoff;
+ snap->vm_file = vma->vm_file ? get_file(vma->vm_file) : NULL;
+
kit->data->last_addr = vma->vm_end;
- return vma;
+ vma_end_read(vma);
+ return snap;
}
__bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
@@ -978,8 +988,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
struct bpf_iter_task_vma_kern *kit = (void *)it;
if (kit->data) {
- if (kit->data->locked_vma)
- vma_end_read(kit->data->locked_vma);
+ if (kit->data->snapshot.vm_file)
+ fput(kit->data->snapshot.vm_file);
bpf_iter_mmput(kit->data->mm);
put_task_struct(kit->data->task);
bpf_mem_free(&bpf_global_ma, kit->data);
--
2.47.3
^ permalink raw reply related [flat|nested] 17+ messages in thread* Re: [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator
2026-03-09 15:54 ` [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator Puranjay Mohan
@ 2026-03-09 17:11 ` Mykyta Yatsenko
2026-03-11 19:07 ` Andrii Nakryiko
1 sibling, 0 replies; 17+ messages in thread
From: Mykyta Yatsenko @ 2026-03-09 17:11 UTC (permalink / raw)
To: Puranjay Mohan, bpf
Cc: Puranjay Mohan, Puranjay Mohan, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
Eduard Zingerman, Kumar Kartikeya Dwivedi, kernel-team
Puranjay Mohan <puranjay@kernel.org> writes:
> Holding the per-VMA lock across the BPF program body creates a lock
> ordering problem when helpers acquire locks that depend on mmap_lock:
>
> vm_lock -> i_rwsem -> mmap_lock -> vm_lock
>
> Snapshot VMA fields under the per-VMA lock in _next(), then drop the
> lock before returning. The BPF program accesses only the snapshot.
>
> Copy vm_start, vm_end, vm_flags, vm_pgoff, vm_page_prot, vm_file, and
> vm_mm. vm_file is reference-counted with get_file() under the lock and
> released via fput() on the next iteration or in _destroy(). vm_mm uses
> the mm pointer already held via mmget().
>
> Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> kernel/bpf/task_iter.c | 34 ++++++++++++++++++++++------------
> 1 file changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index e20c85e06afa..f04d6e310fd3 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -799,7 +799,7 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> struct bpf_iter_task_vma_kern_data {
> struct task_struct *task;
> struct mm_struct *mm;
> - struct vm_area_struct *locked_vma;
> + struct vm_area_struct snapshot;
> u64 last_addr;
> };
>
> @@ -895,8 +895,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> goto err_cleanup_iter;
> }
>
> - kit->data->locked_vma = NULL;
> kit->data->last_addr = addr;
> + memset(&kit->data->snapshot, 0, sizeof(kit->data->snapshot));
> return 0;
>
> err_cleanup_iter:
> @@ -954,23 +954,33 @@ bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> {
> struct bpf_iter_task_vma_kern *kit = (void *)it;
> - struct vm_area_struct *vma;
> + struct vm_area_struct *snap, *vma;
>
> if (!kit->data) /* bpf_iter_task_vma_new failed */
> return NULL;
>
> - if (kit->data->locked_vma)
> - vma_end_read(kit->data->locked_vma);
> + snap = &kit->data->snapshot;
> +
> + if (snap->vm_file) {
> + fput(snap->vm_file);
> + snap->vm_file = NULL;
> + }
>
> vma = bpf_iter_task_vma_find_next(kit->data);
> - if (!vma) {
> - kit->data->locked_vma = NULL;
> + if (!vma)
> return NULL;
> - }
>
> - kit->data->locked_vma = vma;
> + snap->vm_start = vma->vm_start;
> + snap->vm_end = vma->vm_end;
> + snap->vm_mm = kit->data->mm;
> + snap->vm_page_prot = vma->vm_page_prot;
> + snap->flags = vma->flags;
It looks like there is a supported way to copy flags: vm_flags_init() here.
> + snap->vm_pgoff = vma->vm_pgoff;
> + snap->vm_file = vma->vm_file ? get_file(vma->vm_file) : NULL;
> +
> kit->data->last_addr = vma->vm_end;
> - return vma;
> + vma_end_read(vma);
> + return snap;
> }
>
> __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> @@ -978,8 +988,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> struct bpf_iter_task_vma_kern *kit = (void *)it;
>
> if (kit->data) {
> - if (kit->data->locked_vma)
> - vma_end_read(kit->data->locked_vma);
> + if (kit->data->snapshot.vm_file)
> + fput(kit->data->snapshot.vm_file);
> bpf_iter_mmput(kit->data->mm);
> put_task_struct(kit->data->task);
> bpf_mem_free(&bpf_global_ma, kit->data);
> --
> 2.47.3
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator
2026-03-09 15:54 ` [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator Puranjay Mohan
2026-03-09 17:11 ` Mykyta Yatsenko
@ 2026-03-11 19:07 ` Andrii Nakryiko
2026-03-11 19:27 ` Puranjay Mohan
1 sibling, 1 reply; 17+ messages in thread
From: Andrii Nakryiko @ 2026-03-11 19:07 UTC (permalink / raw)
To: Puranjay Mohan
Cc: bpf, Puranjay Mohan, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Eduard Zingerman,
Kumar Kartikeya Dwivedi, Mykyta Yatsenko, kernel-team
On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> Holding the per-VMA lock across the BPF program body creates a lock
> ordering problem when helpers acquire locks that depend on mmap_lock:
>
> vm_lock -> i_rwsem -> mmap_lock -> vm_lock
>
> Snapshot VMA fields under the per-VMA lock in _next(), then drop the
> lock before returning. The BPF program accesses only the snapshot.
>
> Copy vm_start, vm_end, vm_flags, vm_pgoff, vm_page_prot, vm_file, and
> vm_mm. vm_file is reference-counted with get_file() under the lock and
> released via fput() on the next iteration or in _destroy(). vm_mm uses
> the mm pointer already held via mmget().
>
> Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> kernel/bpf/task_iter.c | 34 ++++++++++++++++++++++------------
> 1 file changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> index e20c85e06afa..f04d6e310fd3 100644
> --- a/kernel/bpf/task_iter.c
> +++ b/kernel/bpf/task_iter.c
> @@ -799,7 +799,7 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> struct bpf_iter_task_vma_kern_data {
> struct task_struct *task;
> struct mm_struct *mm;
> - struct vm_area_struct *locked_vma;
> + struct vm_area_struct snapshot;
> u64 last_addr;
> };
>
> @@ -895,8 +895,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> goto err_cleanup_iter;
> }
>
> - kit->data->locked_vma = NULL;
> kit->data->last_addr = addr;
> + memset(&kit->data->snapshot, 0, sizeof(kit->data->snapshot));
> return 0;
>
> err_cleanup_iter:
> @@ -954,23 +954,33 @@ bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> {
> struct bpf_iter_task_vma_kern *kit = (void *)it;
> - struct vm_area_struct *vma;
> + struct vm_area_struct *snap, *vma;
>
> if (!kit->data) /* bpf_iter_task_vma_new failed */
> return NULL;
>
> - if (kit->data->locked_vma)
> - vma_end_read(kit->data->locked_vma);
> + snap = &kit->data->snapshot;
> +
> + if (snap->vm_file) {
> + fput(snap->vm_file);
> + snap->vm_file = NULL;
> + }
we do the same in _destroy, and we might get more "trusted" stuff
added, so to keep all that in sync, let's add a common function that
will "clean up" the vma snapshot?
>
> vma = bpf_iter_task_vma_find_next(kit->data);
> - if (!vma) {
> - kit->data->locked_vma = NULL;
> + if (!vma)
> return NULL;
> - }
>
> - kit->data->locked_vma = vma;
> + snap->vm_start = vma->vm_start;
> + snap->vm_end = vma->vm_end;
> + snap->vm_mm = kit->data->mm;
> + snap->vm_page_prot = vma->vm_page_prot;
> + snap->flags = vma->flags;
> + snap->vm_pgoff = vma->vm_pgoff;
> + snap->vm_file = vma->vm_file ? get_file(vma->vm_file) : NULL;
> +
let's just memcpy() the entire vma, and then get_file(vma->vm_file) for
the explicitly trusted pointers? All other pointers are still useful
even if untrusted (e.g., I can easily see a useful application of
vm_ops+vm_private_data and, separately, anon_vma). It's safe and
correct to have those pointers, we just can't declare them as trusted.
PROBEMEM semantics work fine here.
> kit->data->last_addr = vma->vm_end;
> - return vma;
> + vma_end_read(vma);
> + return snap;
> }
>
> __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> @@ -978,8 +988,8 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
> struct bpf_iter_task_vma_kern *kit = (void *)it;
>
> if (kit->data) {
> - if (kit->data->locked_vma)
> - vma_end_read(kit->data->locked_vma);
> + if (kit->data->snapshot.vm_file)
> + fput(kit->data->snapshot.vm_file);
> bpf_iter_mmput(kit->data->mm);
> put_task_struct(kit->data->task);
> bpf_mem_free(&bpf_global_ma, kit->data);
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 17+ messages in thread* Re: [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator
2026-03-11 19:07 ` Andrii Nakryiko
@ 2026-03-11 19:27 ` Puranjay Mohan
0 siblings, 0 replies; 17+ messages in thread
From: Puranjay Mohan @ 2026-03-11 19:27 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Mykyta Yatsenko, kernel-team
On Wed, Mar 11, 2026 at 7:07 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Mon, Mar 9, 2026 at 8:55 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >
> > Holding the per-VMA lock across the BPF program body creates a lock
> > ordering problem when helpers acquire locks that depend on mmap_lock:
> >
> > vm_lock -> i_rwsem -> mmap_lock -> vm_lock
> >
> > Snapshot VMA fields under the per-VMA lock in _next(), then drop the
> > lock before returning. The BPF program accesses only the snapshot.
> >
> > Copy vm_start, vm_end, vm_flags, vm_pgoff, vm_page_prot, vm_file, and
> > vm_mm. vm_file is reference-counted with get_file() under the lock and
> > released via fput() on the next iteration or in _destroy(). vm_mm uses
> > the mm pointer already held via mmget().
> >
> > Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
> > Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> > ---
> > kernel/bpf/task_iter.c | 34 ++++++++++++++++++++++------------
> > 1 file changed, 22 insertions(+), 12 deletions(-)
> >
> > diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
> > index e20c85e06afa..f04d6e310fd3 100644
> > --- a/kernel/bpf/task_iter.c
> > +++ b/kernel/bpf/task_iter.c
> > @@ -799,7 +799,7 @@ const struct bpf_func_proto bpf_find_vma_proto = {
> > struct bpf_iter_task_vma_kern_data {
> > struct task_struct *task;
> > struct mm_struct *mm;
> > - struct vm_area_struct *locked_vma;
> > + struct vm_area_struct snapshot;
> > u64 last_addr;
> > };
> >
> > @@ -895,8 +895,8 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
> > goto err_cleanup_iter;
> > }
> >
> > - kit->data->locked_vma = NULL;
> > kit->data->last_addr = addr;
> > + memset(&kit->data->snapshot, 0, sizeof(kit->data->snapshot));
> > return 0;
> >
> > err_cleanup_iter:
> > @@ -954,23 +954,33 @@ bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data)
> > __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it)
> > {
> > struct bpf_iter_task_vma_kern *kit = (void *)it;
> > - struct vm_area_struct *vma;
> > + struct vm_area_struct *snap, *vma;
> >
> > if (!kit->data) /* bpf_iter_task_vma_new failed */
> > return NULL;
> >
> > - if (kit->data->locked_vma)
> > - vma_end_read(kit->data->locked_vma);
> > + snap = &kit->data->snapshot;
> > +
> > + if (snap->vm_file) {
> > + fput(snap->vm_file);
> > + snap->vm_file = NULL;
> > + }
>
> we do the same in _destroy, and we might get more "trusted" stuff
> added, so to keep all that in sync, let's add a common function that
> will "clean up" vma snapshot?
>
> >
> > vma = bpf_iter_task_vma_find_next(kit->data);
> > - if (!vma) {
> > - kit->data->locked_vma = NULL;
> > + if (!vma)
> > return NULL;
> > - }
> >
> > - kit->data->locked_vma = vma;
> > + snap->vm_start = vma->vm_start;
> > + snap->vm_end = vma->vm_end;
> > + snap->vm_mm = kit->data->mm;
> > + snap->vm_page_prot = vma->vm_page_prot;
> > + snap->flags = vma->flags;
> > + snap->vm_pgoff = vma->vm_pgoff;
> > + snap->vm_file = vma->vm_file ? get_file(vma->vm_file) : NULL;
> > +
>
> let's just memcpy() entire vma, and then get_file(vma->vm_file) for
> the explicitly trusted pointers? All other pointers are still useful
> even if untrusted (e.g., I can easily see useful application of
> vm_ops+vm_private_data and, separately, anon_vma). It's safe and
> correct to have those pointers, we just can't declare them as trusted.
> PROBEMEM semantics works fine here.
Yes, doing that in the next version.
^ permalink raw reply [flat|nested] 17+ messages in thread