From: Suren Baghdasaryan <surenb@google.com>
To: akpm@linux-foundation.org
Cc: liam@infradead.org, ljs@kernel.org, vbabka@kernel.org,
david@redhat.com, willy@infradead.org, jannh@google.com,
paulmck@kernel.org, pfalcato@suse.de, shuah@kernel.org,
hsukrut3@gmail.com, richard.weiyang@gmail.com,
reddybalavignesh9979@gmail.com, linux-mm@kvack.org,
linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-kselftest@vger.kernel.org, surenb@google.com
Subject: [PATCH 1/3] fs/proc/task_mmu: read proc/pid/{smaps|numa_maps} under per-vma lock
Date: Fri, 24 Apr 2026 00:02:32 -0700
Message-ID: <20260424070234.190145-2-surenb@google.com>
In-Reply-To: <20260424070234.190145-1-surenb@google.com>
proc/pid/{smaps|numa_maps} can be read using a combination of RCU and
per-VMA read locks, similar to proc/pid/maps. RCU is required to safely
traverse the VMA tree, and the VMA lock stabilizes both the VMA being
processed and the page table walk.
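For context, the read side added here follows the pattern sketched
below. This is an illustrative outline rather than the literal patch
code: the fallback to mmap_lock on contention is omitted, the
vma_start_read() call is abbreviated (its exact signature differs
across kernel versions), and it assumes the walk ops use
PGWALK_VMA_RDLOCK_VERIFY as introduced by this patch:

	/* Sketch only; error handling and mmap_lock fallback omitted. */
	static void walk_vmas_locklessly(struct mm_struct *mm,
					 const struct mm_walk_ops *ops,
					 void *priv)
	{
		VMA_ITERATOR(vmi, mm, 0);
		struct vm_area_struct *vma;

		rcu_read_lock();		/* protects VMA tree traversal */
		for_each_vma(vmi, vma) {
			if (!vma_start_read(mm, vma))	/* per-VMA read lock */
				continue;	/* real code falls back to mmap_lock */
			/*
			 * The VMA and its page tables are now stable. The
			 * page table walk may sleep, so drop RCU around it
			 * while the VMA lock is still held.
			 */
			rcu_read_unlock();
			walk_page_vma(vma, ops, priv);
			rcu_read_lock();
			/* Iterator is stale after the RCU gap; reposition it. */
			vma_iter_set(&vmi, vma->vm_end);
			vma_end_read(vma);
		}
		rcu_read_unlock();
	}

The key point is that the per-VMA lock, not RCU, is what keeps the VMA
and its page tables stable across the (possibly sleeping) page walk;
RCU only protects the tree traversal between VMAs.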
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
fs/proc/task_mmu.c | 193 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 154 insertions(+), 39 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 751b9ba160fb..96cfea252db6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -132,6 +132,22 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
#ifdef CONFIG_PER_VMA_LOCK
+static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ int ret = mmap_read_lock_killable(lock_ctx->mm);
+
+ if (!ret)
+ lock_ctx->mmap_locked = true;
+
+ return ret;
+}
+
+static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+ lock_ctx->mmap_locked = false;
+}
+
static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx)
{
lock_ctx->locked_vma = NULL;
@@ -146,25 +162,11 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
}
}
-static const struct seq_operations proc_pid_maps_op;
-
static inline bool lock_vma_range(struct seq_file *m,
struct proc_maps_locking_ctx *lock_ctx)
{
- /*
- * smaps and numa_maps perform page table walk, therefore require
- * mmap_lock but maps can be read with locking just the vma and
- * walking the vma tree under rcu read protection.
- */
- if (m->op != &proc_pid_maps_op) {
- if (mmap_read_lock_killable(lock_ctx->mm))
- return false;
-
- lock_ctx->mmap_locked = true;
- } else {
- rcu_read_lock();
- reset_lock_ctx(lock_ctx);
- }
+ rcu_read_lock();
+ reset_lock_ctx(lock_ctx);
return true;
}
@@ -172,7 +174,7 @@ static inline bool lock_vma_range(struct seq_file *m,
static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
if (lock_ctx->mmap_locked) {
- mmap_read_unlock(lock_ctx->mm);
+ unlock_ctx_mm(lock_ctx);
} else {
unlock_ctx_vma(lock_ctx);
rcu_read_unlock();
@@ -213,17 +215,45 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
return true;
}
+static inline void drop_rcu(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return;
+
+ rcu_read_unlock();
+}
+
+static inline void reacquire_rcu(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return;
+
+ rcu_read_lock();
+ /* Reinitialize the iterator. */
+ vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
+}
+
#else /* CONFIG_PER_VMA_LOCK */
+static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ return mmap_read_lock_killable(lock_ctx->mm);
+}
+
+static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
+{
+ mmap_read_unlock(lock_ctx->mm);
+}
+
static inline bool lock_vma_range(struct seq_file *m,
struct proc_maps_locking_ctx *lock_ctx)
{
- return mmap_read_lock_killable(lock_ctx->mm) == 0;
+ return lock_ctx_mm(lock_ctx) == 0;
}
static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
{
- mmap_read_unlock(lock_ctx->mm);
+ unlock_ctx_mm(lock_ctx);
}
static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
@@ -238,6 +268,9 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
return false;
}
+static inline void drop_rcu(struct proc_maps_private *priv) {}
+static inline void reacquire_rcu(struct proc_maps_private *priv) {}
+
#endif /* CONFIG_PER_VMA_LOCK */
static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
@@ -538,12 +571,10 @@ static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx)
static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx)
{
- if (lock_ctx->mmap_locked) {
- mmap_read_unlock(lock_ctx->mm);
- lock_ctx->mmap_locked = false;
- } else {
+ if (lock_ctx->mmap_locked)
+ unlock_ctx_mm(lock_ctx);
+ else
unlock_ctx_vma(lock_ctx);
- }
}
static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx,
@@ -1280,16 +1311,64 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
+#ifdef CONFIG_PER_VMA_LOCK
+
+static const struct mm_walk_ops smaps_walk_vma_lock_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static const struct mm_walk_ops smaps_shmem_walk_vma_lock_ops = {
+ .pmd_entry = smaps_pte_range,
+ .hugetlb_entry = smaps_hugetlb_range,
+ .pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static inline const struct mm_walk_ops *
+get_smaps_walk_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &smaps_walk_ops;
+ return &smaps_walk_vma_lock_ops;
+}
+
+static inline const struct mm_walk_ops *
+get_smaps_shmem_walk_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &smaps_shmem_walk_ops;
+ return &smaps_shmem_walk_vma_lock_ops;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline const struct mm_walk_ops *
+get_smaps_walk_ops(struct proc_maps_private *priv)
+{
+ return &smaps_walk_ops;
+}
+
+static inline const struct mm_walk_ops *
+get_smaps_shmem_walk_ops(struct proc_maps_private *priv)
+{
+ return &smaps_shmem_walk_ops;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Gather mem stats from @vma with the indicated beginning
* address @start, and keep them in @mss.
*
* Use vm_start of @vma as the beginning address if @start is 0.
*/
-static void smap_gather_stats(struct vm_area_struct *vma,
- struct mem_size_stats *mss, unsigned long start)
+static void smap_gather_stats(struct proc_maps_private *priv,
+ struct vm_area_struct *vma,
+ struct mem_size_stats *mss, unsigned long start)
{
- const struct mm_walk_ops *ops = &smaps_walk_ops;
+ const struct mm_walk_ops *ops = get_smaps_walk_ops(priv);
/* Invalid start */
if (start >= vma->vm_end)
@@ -1312,15 +1391,24 @@ static void smap_gather_stats(struct vm_area_struct *vma,
!(vma->vm_flags & VM_WRITE))) {
mss->swap += shmem_swapped;
} else {
- ops = &smaps_shmem_walk_ops;
+ ops = get_smaps_shmem_walk_ops(priv);
}
}
- /* mmap_lock is held in m_start */
+ /* Skip walking pages if gate VMA */
+ if (vma == get_gate_vma(priv->lock_ctx.mm))
+ return;
+
+ /*
+ * The page table walk may sleep, so drop the RCU read lock before it.
+ * Note that the VMA is still locked.
+ */
+ drop_rcu(priv);
if (!start)
walk_page_vma(vma, ops, mss);
else
walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
+ reacquire_rcu(priv);
}
#define SEQ_PUT_DEC(str, val) \
@@ -1369,10 +1457,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
static int show_smap(struct seq_file *m, void *v)
{
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct mem_size_stats mss = {};
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
show_map_vma(m, vma);
@@ -1413,7 +1502,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
goto out_put_task;
}
- ret = mmap_read_lock_killable(mm);
+ ret = lock_ctx_mm(&priv->lock_ctx);
if (ret)
goto out_put_mm;
@@ -1425,7 +1514,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
vma_start = vma->vm_start;
do {
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
last_vma_end = vma->vm_end;
/*
@@ -1434,8 +1523,8 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
*/
if (mmap_lock_is_contended(mm)) {
vma_iter_invalidate(&vmi);
- mmap_read_unlock(mm);
- ret = mmap_read_lock_killable(mm);
+ unlock_ctx_mm(&priv->lock_ctx);
+ ret = lock_ctx_mm(&priv->lock_ctx);
if (ret) {
release_task_mempolicy(priv);
goto out_put_mm;
@@ -1484,14 +1573,14 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
/* Case 1 and 2 above */
if (vma->vm_start >= last_vma_end) {
- smap_gather_stats(vma, &mss, 0);
+ smap_gather_stats(priv, vma, &mss, 0);
last_vma_end = vma->vm_end;
continue;
}
/* Case 4 above */
if (vma->vm_end > last_vma_end) {
- smap_gather_stats(vma, &mss, last_vma_end);
+ smap_gather_stats(priv, vma, &mss, last_vma_end);
last_vma_end = vma->vm_end;
}
}
@@ -1505,7 +1594,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
__show_smap(m, &mss, true);
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_ctx_mm(&priv->lock_ctx);
out_put_mm:
mmput(mm);
@@ -3291,6 +3380,31 @@ static const struct mm_walk_ops show_numa_ops = {
.walk_lock = PGWALK_RDLOCK,
};
+#ifdef CONFIG_PER_VMA_LOCK
+static const struct mm_walk_ops show_numa_vma_lock_ops = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_VMA_RDLOCK_VERIFY,
+};
+
+static inline const struct mm_walk_ops *
+get_show_numa_ops(struct proc_maps_private *priv)
+{
+ if (priv->lock_ctx.mmap_locked)
+ return &show_numa_ops;
+ return &show_numa_vma_lock_ops;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline const struct mm_walk_ops *
+get_show_numa_ops(struct proc_maps_private *priv)
+{
+ return &show_numa_ops;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Display pages allocated per node and memory policy via /proc.
*/
@@ -3335,8 +3449,9 @@ static int show_numa_map(struct seq_file *m, void *v)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- /* mmap_lock is held by m_start */
- walk_page_vma(vma, &show_numa_ops, md);
+ drop_rcu(proc_priv);
+ walk_page_vma(vma, get_show_numa_ops(proc_priv), md);
+ reacquire_rcu(proc_priv);
if (!md->pages)
goto out;
--
2.54.0.545.g6539524ca2-goog
Thread overview: 10+ messages
2026-04-24 7:02 [PATCH 0/3] use vma locks for proc/pid/{smaps|numa_maps} reads Suren Baghdasaryan
2026-04-24 7:02 ` Suren Baghdasaryan [this message]
2026-04-24 18:19 ` [PATCH 1/3] fs/proc/task_mmu: read proc/pid/{smaps|numa_maps} under per-vma lock Liam R. Howlett
2026-04-24 7:02 ` [PATCH 2/3] selftests/proc: ensure the test is performed at the right page boundary Suren Baghdasaryan
2026-04-24 18:21 ` Liam R. Howlett
2026-04-24 19:58 ` Suren Baghdasaryan
2026-04-24 7:02 ` [PATCH 3/3] selftests/proc: add /proc/pid/smaps tearing tests Suren Baghdasaryan
2026-04-24 18:23 ` Liam R. Howlett
2026-04-24 19:59 ` Suren Baghdasaryan
2026-04-25 23:48 ` Suren Baghdasaryan