All of lore.kernel.org
 help / color / mirror / Atom feed
From: Puranjay Mohan <puranjay@kernel.org>
To: bpf@vger.kernel.org
Cc: Puranjay Mohan <puranjay@kernel.org>,
	Puranjay Mohan <puranjay12@gmail.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Martin KaFai Lau <martin.lau@kernel.org>,
	Eduard Zingerman <eddyz87@gmail.com>,
	Kumar Kartikeya Dwivedi <memxor@gmail.com>,
	Mykyta Yatsenko <mykyta.yatsenko5@gmail.com>,
	kernel-team@meta.com
Subject: [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator
Date: Mon,  9 Mar 2026 08:54:56 -0700	[thread overview]
Message-ID: <20260309155506.23490-3-puranjay@kernel.org> (raw)
In-Reply-To: <20260309155506.23490-1-puranjay@kernel.org>

The open-coded task_vma iterator reads task->mm and acquires
mmap_read_trylock() but never calls mmget(). The mm can reach
mm_users == 0 if the task exits while the iterator holds the lock.

Add mmget_not_zero() before mmap_read_trylock(). Drop the mm reference
via mmput_async() when the trylock fails and in _destroy(); this is
safe from process context. From hardirq/NMI context, or with IRQs
disabled, mmput_async() can re-enter __queue_work() and deadlock on
pool->lock, so defer to irq_work in that case, following the pattern
from defer_timer_wq_op() in kernel/bpf/helpers.c.

Widen the mmput_async() #if guard to include CONFIG_BPF_SYSCALL,
following the same approach used for CONFIG_FUTEX_PRIVATE_HASH.

Fixes: 4ac454682158 ("bpf: Introduce task_vma open-coded iterator kfuncs")
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/sched/mm.h |  2 +-
 kernel/bpf/task_iter.c   | 65 +++++++++++++++++++++++++++++++++++++---
 kernel/fork.c            |  2 +-
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 95d0040df584..5908de0c2f82 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL)
 /* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 7c302ee78f7e..e8efc9e1f602 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -9,6 +9,7 @@
 #include <linux/bpf_mem_alloc.h>
 #include <linux/btf_ids.h>
 #include <linux/mm_types.h>
+#include <linux/sched/mm.h>
 #include "mmap_unlock_work.h"
 
 static const char * const iter_task_type_names[] = {
@@ -813,6 +814,44 @@ struct bpf_iter_task_vma_kern {
 	struct bpf_iter_task_vma_kern_data *data;
 } __attribute__((aligned(8)));
 
+/*
+ * mmput_async() is unsafe in hardirq context or with IRQs disabled: it can
+ * re-enter __queue_work() and deadlock on pool->lock. Defer to irq_work then,
+ * same pattern as defer_timer_wq_op() in kernel/bpf/helpers.c.
+ */
+static DEFINE_PER_CPU(struct bpf_iter_mm_irq_work, bpf_iter_mmput_work);
+
+static void do_bpf_iter_mmput(struct irq_work *entry)
+{
+	struct bpf_iter_mm_irq_work *work;
+
+	work = container_of(entry, struct bpf_iter_mm_irq_work, irq_work);
+	if (work->mm) {
+		mmput_async(work->mm);
+		work->mm = NULL;
+	}
+}
+
+static void bpf_iter_mmput(struct mm_struct *mm)
+{
+	if (!in_hardirq() && !irqs_disabled()) {
+		mmput_async(mm);
+	} else {
+		struct bpf_iter_mm_irq_work *work;
+
+		work = this_cpu_ptr(&bpf_iter_mmput_work);
+		work->mm = mm;
+		irq_work_queue(&work->irq_work);
+	}
+}
+
+static bool bpf_iter_mmput_busy(void)
+{
+	if (!in_hardirq() && !irqs_disabled())
+		return false;
+	return irq_work_is_busy(&this_cpu_ptr(&bpf_iter_mmput_work)->irq_work);
+}
+
 __bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
@@ -840,19 +879,33 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
 		goto err_cleanup_iter;
 	}
 
-	/* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */
+	/*
+	 * Both mmap_lock and mmput irq_work slots must be free for _destroy().
+	 * kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work
+	 */
 	irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work);
-	if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) {
+	if (irq_work_busy || bpf_iter_mmput_busy()) {
 		err = -EBUSY;
 		goto err_cleanup_iter;
 	}
 
+	if (!mmget_not_zero(kit->data->mm)) {
+		err = -ENOENT;
+		goto err_cleanup_iter;
+	}
+
+	if (!mmap_read_trylock(kit->data->mm)) {
+		err = -EBUSY;
+		goto err_cleanup_mmget;
+	}
+
 	vma_iter_init(&kit->data->vmi, kit->data->mm, addr);
 	return 0;
 
+err_cleanup_mmget:
+	bpf_iter_mmput(kit->data->mm);
 err_cleanup_iter:
-	if (kit->data->task)
-		put_task_struct(kit->data->task);
+	put_task_struct(kit->data->task);
 	bpf_mem_free(&bpf_global_ma, kit->data);
 	/* NULL kit->data signals failed bpf_iter_task_vma initialization */
 	kit->data = NULL;
@@ -874,6 +927,7 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it)
 
 	if (kit->data) {
 		bpf_mmap_unlock_mm(kit->data->work, kit->data->mm);
+		bpf_iter_mmput(kit->data->mm);
 		put_task_struct(kit->data->task);
 		bpf_mem_free(&bpf_global_ma, kit->data);
 	}
@@ -1044,12 +1098,15 @@ static void do_mmap_read_unlock(struct irq_work *entry)
 
 static int __init task_iter_init(void)
 {
+	struct bpf_iter_mm_irq_work *mmput_work;
 	struct bpf_iter_mm_irq_work *work;
 	int ret, cpu;
 
 	for_each_possible_cpu(cpu) {
 		work = per_cpu_ptr(&mmap_unlock_work, cpu);
 		init_irq_work(&work->irq_work, do_mmap_read_unlock);
+		mmput_work = per_cpu_ptr(&bpf_iter_mmput_work, cpu);
+		init_irq_work(&mmput_work->irq_work, do_bpf_iter_mmput);
 	}
 
 	task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..d0411a63d4ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1198,7 +1198,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_BPF_SYSCALL)
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct,
-- 
2.47.3


  parent reply	other threads:[~2026-03-09 15:55 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-09 15:54 [PATCH bpf v2 0/4] bpf: fix and improve open-coded task_vma iterator Puranjay Mohan
2026-03-09 15:54 ` [PATCH bpf v2 1/4] bpf: rename mmap_unlock_irq_work to bpf_iter_mm_irq_work Puranjay Mohan
2026-03-11 18:32   ` Andrii Nakryiko
2026-03-09 15:54 ` Puranjay Mohan [this message]
2026-03-09 16:48   ` [PATCH bpf v2 2/4] bpf: fix mm lifecycle in open-coded task_vma iterator Alexei Starovoitov
2026-03-09 18:02     ` Puranjay Mohan
2026-03-09 18:12       ` Alexei Starovoitov
2026-03-11 18:35   ` Andrii Nakryiko
2026-03-09 15:54 ` [PATCH bpf v2 3/4] bpf: switch task_vma iterator from mmap_lock to per-VMA locks Puranjay Mohan
2026-03-09 16:33   ` bot+bpf-ci
2026-03-11 19:00   ` Andrii Nakryiko
2026-03-11 19:25     ` Puranjay Mohan
2026-03-11 23:54       ` Andrii Nakryiko
2026-03-09 15:54 ` [PATCH bpf v2 4/4] bpf: return VMA snapshot from task_vma iterator Puranjay Mohan
2026-03-09 17:11   ` Mykyta Yatsenko
2026-03-11 19:07   ` Andrii Nakryiko
2026-03-11 19:27     ` Puranjay Mohan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260309155506.23490-3-puranjay@kernel.org \
    --to=puranjay@kernel.org \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=eddyz87@gmail.com \
    --cc=kernel-team@meta.com \
    --cc=martin.lau@kernel.org \
    --cc=memxor@gmail.com \
    --cc=mykyta.yatsenko5@gmail.com \
    --cc=puranjay12@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.