[PATCH 1/2] fs/proc/task_mmu: change lock_vma

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code
@ 2026-06-06  1:57 Suren Baghdasaryan
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-06  1:57 UTC (permalink / raw)
  To: akpm
  Cc: liam, ljs, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel, surenb

To correctly propagate error code from lock_vma_range(), change it to
return the error code instead of the boolean. This simplifies error
propagation code.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/task_mmu.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d32408f7cd5e..023422fcee12 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -162,13 +162,13 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
 	}
 }
 
-static inline bool lock_vma_range(struct seq_file *m,
-				  struct proc_maps_locking_ctx *lock_ctx)
+static inline int lock_vma_range(struct seq_file *m,
+				 struct proc_maps_locking_ctx *lock_ctx)
 {
 	rcu_read_lock();
 	reset_lock_ctx(lock_ctx);
 
-	return true;
+	return 0;
 }
 
 static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
@@ -245,10 +245,10 @@ static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
 	mmap_read_unlock(lock_ctx->mm);
 }
 
-static inline bool lock_vma_range(struct seq_file *m,
+static inline int lock_vma_range(struct seq_file *m,
 				  struct proc_maps_locking_ctx *lock_ctx)
 {
-	return lock_ctx_mm(lock_ctx) == 0;
+	return lock_ctx_mm(lock_ctx);
 }
 
 static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
@@ -311,6 +311,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 	struct proc_maps_locking_ctx *lock_ctx;
 	loff_t last_addr = *ppos;
 	struct mm_struct *mm;
+	int err;
 
 	/* See m_next(). Zero at the start or after lseek. */
 	if (last_addr == SENTINEL_VMA_END)
@@ -328,11 +329,12 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 		return NULL;
 	}
 
-	if (!lock_vma_range(m, lock_ctx)) {
+	err = lock_vma_range(m, lock_ctx);
+	if (err) {
 		mmput(mm);
 		put_task_struct(priv->task);
 		priv->task = NULL;
-		return ERR_PTR(-EINTR);
+		return ERR_PTR(err);
 	}
 
 	/*

base-commit: e178a530a81621a29efbca49b3b78202a18236e4
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  1:57 [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code Suren Baghdasaryan
@ 2026-06-06  1:57 ` Suren Baghdasaryan
  2026-06-06  2:00   ` Suren Baghdasaryan
                     ` (3 more replies)
  2026-06-08 15:38 ` [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code David Hildenbrand (Arm)
  2026-06-09  8:27 ` Lorenzo Stoakes
  2 siblings, 4 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-06  1:57 UTC (permalink / raw)
  To: akpm
  Cc: liam, ljs, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel, surenb

proc/pid/smaps_rollup can be read using the combination of RCU and
VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
required to safely traverse the VMA tree and VMA lock stabilizes the
VMA being processed and the pagetable walk.
Note that we have to keep the logic to drop mmap_lock on contention
because even when using per-VMA locks we might have to fall back to
holding the mmap_lock.

Running Paul's contention benchmark [1] shows considerable improvement
both in median and in the worst case latencies:

Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
--busyduration 2 --procfile smaps_rollup

Baseline:

    0.174     0.161     2.553
    0.174     0.164     2.663
    0.174     0.165     2.664
    0.174     0.166     2.679
    0.174     0.167     2.691
    0.174     0.168     2.704
    0.174     0.169     2.729
    0.174     0.172     2.741
    0.174     0.174     2.745
    0.174     0.174     2.755
    0.174     0.175     2.790
    0.174     0.177     2.809
    0.174     0.179     3.096
    0.174     0.183     3.144
    0.174     0.184     3.158
    0.174     0.185     3.175
    0.174     0.185     4.568
    0.174     0.198     4.821
    0.174     0.214     5.143
    0.174     0.251     5.220

Patched:

    0.007     0.007     1.952
    0.007     0.007     1.955
    0.007     0.007     1.955
    0.007     0.007     1.955
    0.007     0.007     1.957
    0.007     0.007     1.969
    0.007     0.007     2.065
    0.007     0.007     2.075
    0.007     0.007     2.146
    0.007     0.007     2.195
    0.007     0.007     2.223
    0.007     0.007     2.259
    0.007     0.007     2.488
    0.007     0.007     2.562
    0.007     0.007     2.599
    0.007     0.007     2.697
    0.007     0.007     3.030
    0.007     0.007     3.075
    0.007     0.007     3.145
    0.007     0.007     3.225

[1] https://github.com/paulmckrcu/proc-mmap_sem-test

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
 1 file changed, 59 insertions(+), 75 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 023422fcee12..c2bd9f5bbbcd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
 	vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
 }
 
+static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
+{
+	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+
+	if (!lock_ctx->mmap_locked)
+		return false;
+
+	return !!mmap_lock_is_contended(lock_ctx->mm);
+}
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
@@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 	return false;
 }
 
+static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
+{
+	return !!mmap_lock_is_contended(priv->lock_ctx.mm);
+}
+
 static inline void drop_rcu(struct proc_maps_private *priv) {}
 static inline void reacquire_rcu(struct proc_maps_private *priv) {}
 
@@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
 static int show_smaps_rollup(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
+	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
+	struct mm_struct *mm = lock_ctx->mm;
 	struct mem_size_stats mss = {};
-	struct mm_struct *mm = priv->lock_ctx.mm;
 	struct vm_area_struct *vma;
-	unsigned long vma_start = 0, last_vma_end = 0;
+	unsigned long vma_start = 0;
+	unsigned long last_vma_end = 0;
+	loff_t pos = 0;
 	int ret = 0;
-	VMA_ITERATOR(vmi, mm, 0);
+
 
 	priv->task = get_proc_task(priv->inode);
 	if (!priv->task)
@@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 		goto out_put_task;
 	}
 
-	ret = lock_ctx_mm(&priv->lock_ctx);
+	hold_task_mempolicy(priv);
+	ret = lock_vma_range(m, lock_ctx);
 	if (ret)
 		goto out_put_mm;
 
-	hold_task_mempolicy(priv);
-	vma = vma_next(&vmi);
-
-	if (unlikely(!vma))
+	vma_iter_init(&priv->iter, mm, 0);
+	vma = proc_get_vma(m, &pos);
+	if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
 		goto empty_set;
 
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto out_unlock;
+	}
+
 	vma_start = vma->vm_start;
-	do {
-		smap_gather_stats(priv, vma, &mss, 0);
+	while (vma) {
+		if (IS_ERR(vma)) {
+			ret = PTR_ERR(vma);
+			goto out_unlock;
+		}
+
+		if (vma == get_gate_vma(priv->lock_ctx.mm))
+			break;
+
+		/*
+		 * If after retaking mmap_lock, already reported VMA grew or
+		 * merged with the next one, then iterate from last_vma_end.
+		 */
+		smap_gather_stats(priv, vma, &mss,
+				  vma->vm_start < last_vma_end ? last_vma_end : 0);
 		last_vma_end = vma->vm_end;
 
 		/*
 		 * Release mmap_lock temporarily if someone wants to
-		 * access it for write request.
+		 * take it for write request.
 		 */
-		if (mmap_lock_is_contended(mm)) {
-			vma_iter_invalidate(&vmi);
-			unlock_ctx_mm(&priv->lock_ctx);
-			ret = lock_ctx_mm(&priv->lock_ctx);
-			if (ret) {
-				release_task_mempolicy(priv);
+		if (is_mmap_lock_contended(priv)) {
+			unlock_vma_range(&priv->lock_ctx);
+			ret = lock_vma_range(m, lock_ctx);
+			if (ret)
 				goto out_put_mm;
-			}
-
-			/*
-			 * After dropping the lock, there are four cases to
-			 * consider. See the following example for explanation.
-			 *
-			 *   +------+------+-----------+
-			 *   | VMA1 | VMA2 | VMA3      |
-			 *   +------+------+-----------+
-			 *   |      |      |           |
-			 *  4k     8k     16k         400k
-			 *
-			 * Suppose we drop the lock after reading VMA2 due to
-			 * contention, then we get:
-			 *
-			 *	last_vma_end = 16k
-			 *
-			 * 1) VMA2 is freed, but VMA3 exists:
-			 *
-			 *    vma_next(vmi) will return VMA3.
-			 *    In this case, just continue from VMA3.
-			 *
-			 * 2) VMA2 still exists:
-			 *
-			 *    vma_next(vmi) will return VMA3.
-			 *    In this case, just continue from VMA3.
-			 *
-			 * 3) No more VMAs can be found:
-			 *
-			 *    vma_next(vmi) will return NULL.
-			 *    No more things to do, just break.
-			 *
-			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
-			 *
-			 *    vma_next(vmi) will return VMA' whose range
-			 *    contains last_vma_end.
-			 *    Iterate VMA' from last_vma_end.
-			 */
-			vma = vma_next(&vmi);
-			/* Case 3 above */
-			if (!vma)
-				break;
-
-			/* Case 1 and 2 above */
-			if (vma->vm_start >= last_vma_end) {
-				smap_gather_stats(priv, vma, &mss, 0);
-				last_vma_end = vma->vm_end;
-				continue;
-			}
 
-			/* Case 4 above */
-			if (vma->vm_end > last_vma_end) {
-				smap_gather_stats(priv, vma, &mss, last_vma_end);
-				last_vma_end = vma->vm_end;
-			}
+			/* Resume from the last position. */
+			pos = last_vma_end;
+			vma_iter_init(&priv->iter, mm, pos);
 		}
-	} for_each_vma(vmi, vma);
+		vma = proc_get_vma(m, &pos);
+	}
 
 empty_set:
 	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
@@ -1593,10 +1577,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 
 	__show_smap(m, &mss, true);
 
-	release_task_mempolicy(priv);
-	unlock_ctx_mm(&priv->lock_ctx);
-
+out_unlock:
+	unlock_vma_range(&priv->lock_ctx);
 out_put_mm:
+	release_task_mempolicy(priv);
 	mmput(mm);
 out_put_task:
 	put_task_struct(priv->task);
-- 
2.54.0.1032.g2f8565e1d1-goog


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
@ 2026-06-06  2:00   ` Suren Baghdasaryan
  2026-06-06  8:12   ` Lorenzo Stoakes
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-06  2:00 UTC (permalink / raw)
  To: akpm
  Cc: liam, ljs, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Fri, Jun 5, 2026 at 6:57 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> proc/pid/smaps_rollup can be read using the combination of RCU and
> VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> required to safely traverse the VMA tree and VMA lock stabilizes the
> VMA being processed and the pagetable walk.
> Note that we have to keep the logic to drop mmap_lock on contention
> because even when using per-VMA locks we might have to fall back to
> holding the mmap_lock.
>
> Running Paul's contention benchmark [1] shows considerable improvement
> both in median and in the worst case latencies:
>
> Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> --busyduration 2 --procfile smaps_rollup
>
> Baseline:

Forgot to add the names for the columns:

#  Median   Minimum   Maximum

>     0.174     0.161     2.553
>     0.174     0.164     2.663
>     0.174     0.165     2.664
>     0.174     0.166     2.679
>     0.174     0.167     2.691
>     0.174     0.168     2.704
>     0.174     0.169     2.729
>     0.174     0.172     2.741
>     0.174     0.174     2.745
>     0.174     0.174     2.755
>     0.174     0.175     2.790
>     0.174     0.177     2.809
>     0.174     0.179     3.096
>     0.174     0.183     3.144
>     0.174     0.184     3.158
>     0.174     0.185     3.175
>     0.174     0.185     4.568
>     0.174     0.198     4.821
>     0.174     0.214     5.143
>     0.174     0.251     5.220
>
> Patched:
>
>     0.007     0.007     1.952
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.957
>     0.007     0.007     1.969
>     0.007     0.007     2.065
>     0.007     0.007     2.075
>     0.007     0.007     2.146
>     0.007     0.007     2.195
>     0.007     0.007     2.223
>     0.007     0.007     2.259
>     0.007     0.007     2.488
>     0.007     0.007     2.562
>     0.007     0.007     2.599
>     0.007     0.007     2.697
>     0.007     0.007     3.030
>     0.007     0.007     3.075
>     0.007     0.007     3.145
>     0.007     0.007     3.225
>
> [1] https://github.com/paulmckrcu/proc-mmap_sem-test
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
>  1 file changed, 59 insertions(+), 75 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 023422fcee12..c2bd9f5bbbcd 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
>         vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
>  }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> +       struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +
> +       if (!lock_ctx->mmap_locked)
> +               return false;
> +
> +       return !!mmap_lock_is_contended(lock_ctx->mm);
> +}
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>
>  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
>         return false;
>  }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> +       return !!mmap_lock_is_contended(priv->lock_ctx.mm);
> +}
> +
>  static inline void drop_rcu(struct proc_maps_private *priv) {}
>  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
>
> @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
>  static int show_smaps_rollup(struct seq_file *m, void *v)
>  {
>         struct proc_maps_private *priv = m->private;
> +       struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +       struct mm_struct *mm = lock_ctx->mm;
>         struct mem_size_stats mss = {};
> -       struct mm_struct *mm = priv->lock_ctx.mm;
>         struct vm_area_struct *vma;
> -       unsigned long vma_start = 0, last_vma_end = 0;
> +       unsigned long vma_start = 0;
> +       unsigned long last_vma_end = 0;
> +       loff_t pos = 0;
>         int ret = 0;
> -       VMA_ITERATOR(vmi, mm, 0);
> +
>
>         priv->task = get_proc_task(priv->inode);
>         if (!priv->task)
> @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>                 goto out_put_task;
>         }
>
> -       ret = lock_ctx_mm(&priv->lock_ctx);
> +       hold_task_mempolicy(priv);
> +       ret = lock_vma_range(m, lock_ctx);
>         if (ret)
>                 goto out_put_mm;
>
> -       hold_task_mempolicy(priv);
> -       vma = vma_next(&vmi);
> -
> -       if (unlikely(!vma))
> +       vma_iter_init(&priv->iter, mm, 0);
> +       vma = proc_get_vma(m, &pos);
> +       if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
>                 goto empty_set;
>
> +       if (IS_ERR(vma)) {
> +               ret = PTR_ERR(vma);
> +               goto out_unlock;
> +       }
> +
>         vma_start = vma->vm_start;
> -       do {
> -               smap_gather_stats(priv, vma, &mss, 0);
> +       while (vma) {
> +               if (IS_ERR(vma)) {
> +                       ret = PTR_ERR(vma);
> +                       goto out_unlock;
> +               }
> +
> +               if (vma == get_gate_vma(priv->lock_ctx.mm))
> +                       break;
> +
> +               /*
> +                * If after retaking mmap_lock, already reported VMA grew or
> +                * merged with the next one, then iterate from last_vma_end.
> +                */
> +               smap_gather_stats(priv, vma, &mss,
> +                                 vma->vm_start < last_vma_end ? last_vma_end : 0);
>                 last_vma_end = vma->vm_end;
>
>                 /*
>                  * Release mmap_lock temporarily if someone wants to
> -                * access it for write request.
> +                * take it for write request.
>                  */
> -               if (mmap_lock_is_contended(mm)) {
> -                       vma_iter_invalidate(&vmi);
> -                       unlock_ctx_mm(&priv->lock_ctx);
> -                       ret = lock_ctx_mm(&priv->lock_ctx);
> -                       if (ret) {
> -                               release_task_mempolicy(priv);
> +               if (is_mmap_lock_contended(priv)) {
> +                       unlock_vma_range(&priv->lock_ctx);
> +                       ret = lock_vma_range(m, lock_ctx);
> +                       if (ret)
>                                 goto out_put_mm;
> -                       }
> -
> -                       /*
> -                        * After dropping the lock, there are four cases to
> -                        * consider. See the following example for explanation.
> -                        *
> -                        *   +------+------+-----------+
> -                        *   | VMA1 | VMA2 | VMA3      |
> -                        *   +------+------+-----------+
> -                        *   |      |      |           |
> -                        *  4k     8k     16k         400k
> -                        *
> -                        * Suppose we drop the lock after reading VMA2 due to
> -                        * contention, then we get:
> -                        *
> -                        *      last_vma_end = 16k
> -                        *
> -                        * 1) VMA2 is freed, but VMA3 exists:
> -                        *
> -                        *    vma_next(vmi) will return VMA3.
> -                        *    In this case, just continue from VMA3.
> -                        *
> -                        * 2) VMA2 still exists:
> -                        *
> -                        *    vma_next(vmi) will return VMA3.
> -                        *    In this case, just continue from VMA3.
> -                        *
> -                        * 3) No more VMAs can be found:
> -                        *
> -                        *    vma_next(vmi) will return NULL.
> -                        *    No more things to do, just break.
> -                        *
> -                        * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> -                        *
> -                        *    vma_next(vmi) will return VMA' whose range
> -                        *    contains last_vma_end.
> -                        *    Iterate VMA' from last_vma_end.
> -                        */
> -                       vma = vma_next(&vmi);
> -                       /* Case 3 above */
> -                       if (!vma)
> -                               break;
> -
> -                       /* Case 1 and 2 above */
> -                       if (vma->vm_start >= last_vma_end) {
> -                               smap_gather_stats(priv, vma, &mss, 0);
> -                               last_vma_end = vma->vm_end;
> -                               continue;
> -                       }
>
> -                       /* Case 4 above */
> -                       if (vma->vm_end > last_vma_end) {
> -                               smap_gather_stats(priv, vma, &mss, last_vma_end);
> -                               last_vma_end = vma->vm_end;
> -                       }
> +                       /* Resume from the last position. */
> +                       pos = last_vma_end;
> +                       vma_iter_init(&priv->iter, mm, pos);
>                 }
> -       } for_each_vma(vmi, vma);
> +               vma = proc_get_vma(m, &pos);
> +       }
>
>  empty_set:
>         show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
> @@ -1593,10 +1577,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>
>         __show_smap(m, &mss, true);
>
> -       release_task_mempolicy(priv);
> -       unlock_ctx_mm(&priv->lock_ctx);
> -
> +out_unlock:
> +       unlock_vma_range(&priv->lock_ctx);
>  out_put_mm:
> +       release_task_mempolicy(priv);
>         mmput(mm);
>  out_put_task:
>         put_task_struct(priv->task);
> --
> 2.54.0.1032.g2f8565e1d1-goog
>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
  2026-06-06  2:00   ` Suren Baghdasaryan
@ 2026-06-06  8:12   ` Lorenzo Stoakes
  2026-06-07 19:45     ` Suren Baghdasaryan
  2026-06-08 15:52   ` David Hildenbrand (Arm)
  2026-06-09 10:00   ` Lorenzo Stoakes
  3 siblings, 1 reply; 17+ messages in thread
From: Lorenzo Stoakes @ 2026-06-06  8:12 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

(Will review separately)

Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
series it's easier to handle if sent with patches in-reply-to a cover
letter :>) also makes it easier to track overall updates in cover letter
changelog.

Thanks, Lorenzo


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  8:12   ` Lorenzo Stoakes
@ 2026-06-07 19:45     ` Suren Baghdasaryan
  2026-06-08  8:14       ` Lorenzo Stoakes
  0 siblings, 1 reply; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-07 19:45 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Sat, Jun 6, 2026 at 1:12 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
>
> (Will review separately)
>
> Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
> series it's easier to handle if sent with patches in-reply-to a cover
> letter :>) also makes it easier to track overall updates in cover letter
> changelog.

Yeah but the first patch is a simple cleanup of about 10 lines, and
writing a cover-letter for a single patch seemed excessive. Andrew
would have folded it in with this patch anyway, I think.


>
> Thanks, Lorenzo


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-07 19:45     ` Suren Baghdasaryan
@ 2026-06-08  8:14       ` Lorenzo Stoakes
  2026-06-08 15:32         ` David Hildenbrand (Arm)
  2026-06-08 15:43         ` Suren Baghdasaryan
  0 siblings, 2 replies; 17+ messages in thread
From: Lorenzo Stoakes @ 2026-06-08  8:14 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Sun, Jun 07, 2026 at 12:45:23PM -0700, Suren Baghdasaryan wrote:
> On Sat, Jun 6, 2026 at 1:12 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
> >
> > (Will review separately)
> >
> > Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
> > series it's easier to handle if sent with patches in-reply-to a cover
> > letter :>) also makes it easier to track overall updates in cover letter
> > changelog.
>
> Yeah but the first patch is a simple cleanup of about 10 lines, and
> writing a cover-letter for a single patch seemed excessive. Andrew
> would have folded it in with this patch anyway, I think.

Makes sense, sorry to nag but I want to be consistent in nagging everybody
the same :P no matter how well I know them or otherwise ;)

I will take a look through the patches properly in a bit, I meant to
actually do that on Sat but got distracted...!

>
>
> >
> > Thanks, Lorenzo

Cheers, Lorenzo


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-08  8:14       ` Lorenzo Stoakes
@ 2026-06-08 15:32         ` David Hildenbrand (Arm)
  2026-06-08 15:44           ` Suren Baghdasaryan
  2026-06-08 15:43         ` Suren Baghdasaryan
  1 sibling, 1 reply; 17+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-08 15:32 UTC (permalink / raw)
  To: Lorenzo Stoakes, Suren Baghdasaryan
  Cc: akpm, liam, vbabka, willy, jannh, paulmck, pfalcato, linux-mm,
	linux-kernel, linux-fsdevel

On 6/8/26 10:14, Lorenzo Stoakes wrote:
> On Sun, Jun 07, 2026 at 12:45:23PM -0700, Suren Baghdasaryan wrote:
>> On Sat, Jun 6, 2026 at 1:12 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
>>>
>>> (Will review separately)
>>>
>>> Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
>>> series it's easier to handle if sent with patches in-reply-to a cover
>>> letter :>) also makes it easier to track overall updates in cover letter
>>> changelog.
>>
>> Yeah but the first patch is a simple cleanup of about 10 lines, and
>> writing a cover-letter for a single patch seemed excessive. Andrew
>> would have folded it in with this patch anyway, I think.
> 
> Makes sense, sorry to nag but I want to be consistent in nagging everybody
> the same :P no matter how well I know them or otherwise ;)

I mean, it always looks like there is something missing in my inbox (e.g.,
people CCing on patches but not on the cover letter). So then I dig for a
missing cover letter and find ... nothing.

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code
  2026-06-06  1:57 [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code Suren Baghdasaryan
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
@ 2026-06-08 15:38 ` David Hildenbrand (Arm)
  2026-06-08 15:43   ` Suren Baghdasaryan
  2026-06-09  8:27 ` Lorenzo Stoakes
  2 siblings, 1 reply; 17+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-08 15:38 UTC (permalink / raw)
  To: Suren Baghdasaryan, akpm
  Cc: liam, ljs, vbabka, willy, jannh, paulmck, pfalcato, linux-mm,
	linux-kernel, linux-fsdevel

On 6/6/26 03:57, Suren Baghdasaryan wrote:
> To correctly propagate error code from lock_vma_range(), change it to
> return the error code instead of the boolean. This simplifies error
> propagation code.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index d32408f7cd5e..023422fcee12 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -162,13 +162,13 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
>  	}
>  }
>  
> -static inline bool lock_vma_range(struct seq_file *m,
> -				  struct proc_maps_locking_ctx *lock_ctx)
> +static inline int lock_vma_range(struct seq_file *m,
> +				 struct proc_maps_locking_ctx *lock_ctx)
>  {
>  	rcu_read_lock();
>  	reset_lock_ctx(lock_ctx);
>  
> -	return true;
> +	return 0;
>  }
>  
>  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> @@ -245,10 +245,10 @@ static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
>  	mmap_read_unlock(lock_ctx->mm);
>  }
>  
> -static inline bool lock_vma_range(struct seq_file *m,
> +static inline int lock_vma_range(struct seq_file *m,
>  				  struct proc_maps_locking_ctx *lock_ctx)
>  {
> -	return lock_ctx_mm(lock_ctx) == 0;
> +	return lock_ctx_mm(lock_ctx);
>  }
>  
>  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> @@ -311,6 +311,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  	struct proc_maps_locking_ctx *lock_ctx;
>  	loff_t last_addr = *ppos;
>  	struct mm_struct *mm;
> +	int err;
>  
>  	/* See m_next(). Zero at the start or after lseek. */
>  	if (last_addr == SENTINEL_VMA_END)
> @@ -328,11 +329,12 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  		return NULL;
>  	}
>  
> -	if (!lock_vma_range(m, lock_ctx)) {
> +	err = lock_vma_range(m, lock_ctx);
> +	if (err) {
>  		mmput(mm);
>  		put_task_struct(priv->task);
>  		priv->task = NULL;
> -		return ERR_PTR(-EINTR);
> +		return ERR_PTR(err);

Looks good: down_read_killable() would also have returned -EINTR.

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code
  2026-06-08 15:38 ` [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code David Hildenbrand (Arm)
@ 2026-06-08 15:43   ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-08 15:43 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: akpm, liam, ljs, vbabka, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Mon, Jun 8, 2026 at 8:38 AM David Hildenbrand (Arm) <david@kernel.org> wrote:
>
> On 6/6/26 03:57, Suren Baghdasaryan wrote:
> > To correctly propagate error code from lock_vma_range(), change it to
> > return the error code instead of the boolean. This simplifies error
> > propagation code.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/proc/task_mmu.c | 16 +++++++++-------
> >  1 file changed, 9 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index d32408f7cd5e..023422fcee12 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -162,13 +162,13 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
> >       }
> >  }
> >
> > -static inline bool lock_vma_range(struct seq_file *m,
> > -                               struct proc_maps_locking_ctx *lock_ctx)
> > +static inline int lock_vma_range(struct seq_file *m,
> > +                              struct proc_maps_locking_ctx *lock_ctx)
> >  {
> >       rcu_read_lock();
> >       reset_lock_ctx(lock_ctx);
> >
> > -     return true;
> > +     return 0;
> >  }
> >
> >  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -245,10 +245,10 @@ static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> >       mmap_read_unlock(lock_ctx->mm);
> >  }
> >
> > -static inline bool lock_vma_range(struct seq_file *m,
> > +static inline int lock_vma_range(struct seq_file *m,
> >                                 struct proc_maps_locking_ctx *lock_ctx)
> >  {
> > -     return lock_ctx_mm(lock_ctx) == 0;
> > +     return lock_ctx_mm(lock_ctx);
> >  }
> >
> >  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -311,6 +311,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >       struct proc_maps_locking_ctx *lock_ctx;
> >       loff_t last_addr = *ppos;
> >       struct mm_struct *mm;
> > +     int err;
> >
> >       /* See m_next(). Zero at the start or after lseek. */
> >       if (last_addr == SENTINEL_VMA_END)
> > @@ -328,11 +329,12 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >               return NULL;
> >       }
> >
> > -     if (!lock_vma_range(m, lock_ctx)) {
> > +     err = lock_vma_range(m, lock_ctx);
> > +     if (err) {
> >               mmput(mm);
> >               put_task_struct(priv->task);
> >               priv->task = NULL;
> > -             return ERR_PTR(-EINTR);
> > +             return ERR_PTR(err);
>
> Looks good: down_read_killable() would also have returned -EINTR.

Yes. I also use lock_vma_range() in the next patch and this change
makes the code a bit more cleaner.

>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>

Thanks!

>
> --
> Cheers,
>
> David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-08  8:14       ` Lorenzo Stoakes
  2026-06-08 15:32         ` David Hildenbrand (Arm)
@ 2026-06-08 15:43         ` Suren Baghdasaryan
  1 sibling, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-08 15:43 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Mon, Jun 8, 2026 at 1:14 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
>
> On Sun, Jun 07, 2026 at 12:45:23PM -0700, Suren Baghdasaryan wrote:
> > On Sat, Jun 6, 2026 at 1:12 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
> > >
> > > (Will review separately)
> > >
> > > Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
> > > series it's easier to handle if sent with patches in-reply-to a cover
> > > letter :>) also makes it easier to track overall updates in cover letter
> > > changelog.
> >
> > Yeah but the first patch is a simple cleanup of about 10 lines, and
> > writing a cover-letter for a single patch seemed excessive. Andrew
> > would have folded it in with this patch anyway, I think.
>
> Makes sense, sorry to nag but I want to be consistent in nagging everybody
> the same :P no matter how well I know them or otherwise ;)
>
> I will take a look through the patches properly in a bit, I meant to
> actually do that on Sat but got distracted...!

Thanks!

>
> >
> >
> > >
> > > Thanks, Lorenzo
>
> Cheers, Lorenzo


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-08 15:32         ` David Hildenbrand (Arm)
@ 2026-06-08 15:44           ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-08 15:44 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Lorenzo Stoakes, akpm, liam, vbabka, willy, jannh, paulmck,
	pfalcato, linux-mm, linux-kernel, linux-fsdevel

On Mon, Jun 8, 2026 at 8:32 AM David Hildenbrand (Arm) <david@kernel.org> wrote:
>
> On 6/8/26 10:14, Lorenzo Stoakes wrote:
> > On Sun, Jun 07, 2026 at 12:45:23PM -0700, Suren Baghdasaryan wrote:
> >> On Sat, Jun 6, 2026 at 1:12 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
> >>>
> >>> (Will review separately)
> >>>
> >>> Sorry to be a pain :) and I know it's a petty thing, but even for a 2 patch
> >>> series it's easier to handle if sent with patches in-reply-to a cover
> >>> letter :>) also makes it easier to track overall updates in cover letter
> >>> changelog.
> >>
> >> Yeah but the first patch is a simple cleanup of about 10 lines, and
> >> writing a cover-letter for a single patch seemed excessive. Andrew
> >> would have folded it in with this patch anyway, I think.
> >
> > Makes sense, sorry to nag but I want to be consistent in nagging everybody
> > the same :P no matter how well I know them or otherwise ;)
>
> I mean, it always looks like there is something missing in my inbox (e.g.,
> people CCing on patches but not on the cover letter). So then I dig for a
> missing cover letter and find ... nothing.

Ok, I'm convinced. I will add a cover letter if I get to respin this
patchset. Thanks!

>
> --
> Cheers,
>
> David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
  2026-06-06  2:00   ` Suren Baghdasaryan
  2026-06-06  8:12   ` Lorenzo Stoakes
@ 2026-06-08 15:52   ` David Hildenbrand (Arm)
  2026-06-08 16:20     ` Suren Baghdasaryan
  2026-06-09 10:00   ` Lorenzo Stoakes
  3 siblings, 1 reply; 17+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-08 15:52 UTC (permalink / raw)
  To: Suren Baghdasaryan, akpm
  Cc: liam, ljs, vbabka, willy, jannh, paulmck, pfalcato, linux-mm,
	linux-kernel, linux-fsdevel

On 6/6/26 03:57, Suren Baghdasaryan wrote:
> proc/pid/smaps_rollup can be read using the combination of RCU and
> VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> required to safely traverse the VMA tree and VMA lock stabilizes the
> VMA being processed and the pagetable walk.
> Note that we have to keep the logic to drop mmap_lock on contention
> because even when using per-VMA locks we might have to fall back to
> holding the mmap_lock.
> 
> Running Paul's contention benchmark [1] shows considerable improvement
> both in median and in the worst case latencies:
> 
> Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> --busyduration 2 --procfile smaps_rollup
> 
> Baseline:
> 
>     0.174     0.161     2.553
>     0.174     0.164     2.663
>     0.174     0.165     2.664
>     0.174     0.166     2.679
>     0.174     0.167     2.691
>     0.174     0.168     2.704
>     0.174     0.169     2.729
>     0.174     0.172     2.741
>     0.174     0.174     2.745
>     0.174     0.174     2.755
>     0.174     0.175     2.790
>     0.174     0.177     2.809
>     0.174     0.179     3.096
>     0.174     0.183     3.144
>     0.174     0.184     3.158
>     0.174     0.185     3.175
>     0.174     0.185     4.568
>     0.174     0.198     4.821
>     0.174     0.214     5.143
>     0.174     0.251     5.220
> 
> Patched:
> 
>     0.007     0.007     1.952
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.957
>     0.007     0.007     1.969
>     0.007     0.007     2.065
>     0.007     0.007     2.075
>     0.007     0.007     2.146
>     0.007     0.007     2.195
>     0.007     0.007     2.223
>     0.007     0.007     2.259
>     0.007     0.007     2.488
>     0.007     0.007     2.562
>     0.007     0.007     2.599
>     0.007     0.007     2.697
>     0.007     0.007     3.030
>     0.007     0.007     3.075
>     0.007     0.007     3.145
>     0.007     0.007     3.225
> 
> [1] https://github.com/paulmckrcu/proc-mmap_sem-test
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
>  1 file changed, 59 insertions(+), 75 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 023422fcee12..c2bd9f5bbbcd 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
>  	vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
>  }
>  
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)

is_mmap_lock_contended() vs. mmap_lock_is_contended() ... really nasty.

> +{
> +	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +
> +	if (!lock_ctx->mmap_locked)
> +		return false;
> +
> +	return !!mmap_lock_is_contended(lock_ctx->mm);

!! is not required anymore, the compiler nowadays knows how to handle booleans
correctly.

> +}
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>  
>  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
>  	return false;
>  }
>  
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> +	return !!mmap_lock_is_contended(priv->lock_ctx.mm);
> +}
> +
>  static inline void drop_rcu(struct proc_maps_private *priv) {}
>  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
>  
> @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
>  static int show_smaps_rollup(struct seq_file *m, void *v)
>  {
>  	struct proc_maps_private *priv = m->private;
> +	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +	struct mm_struct *mm = lock_ctx->mm;
>  	struct mem_size_stats mss = {};
> -	struct mm_struct *mm = priv->lock_ctx.mm;
>  	struct vm_area_struct *vma;
> -	unsigned long vma_start = 0, last_vma_end = 0;
> +	unsigned long vma_start = 0;
> +	unsigned long last_vma_end = 0;
> +	loff_t pos = 0;
>  	int ret = 0;
> -	VMA_ITERATOR(vmi, mm, 0);
> +
>  

Is there now a double-empty line? (it's getting late here)

>  	priv->task = get_proc_task(priv->inode);
>  	if (!priv->task)
> @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>  		goto out_put_task;
>  	}
>  
> -	ret = lock_ctx_mm(&priv->lock_ctx);
> +	hold_task_mempolicy(priv);
> +	ret = lock_vma_range(m, lock_ctx);
>  	if (ret)
>  		goto out_put_mm;
>  
> -	hold_task_mempolicy(priv);
> -	vma = vma_next(&vmi);
> -
> -	if (unlikely(!vma))
> +	vma_iter_init(&priv->iter, mm, 0);
> +	vma = proc_get_vma(m, &pos);
> +	if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
>  		goto empty_set;
>  
> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto out_unlock;
> +	}
> +
>  	vma_start = vma->vm_start;
> -	do {
> -		smap_gather_stats(priv, vma, &mss, 0);
> +	while (vma) {
> +		if (IS_ERR(vma)) {
> +			ret = PTR_ERR(vma);
> +			goto out_unlock;
> +		}
> +
> +		if (vma == get_gate_vma(priv->lock_ctx.mm))
> +			break;
> +

Can you refresh my brain why we now have to check for gate VMAs explicitly?

> +		/*
> +		 * If after retaking mmap_lock, already reported VMA grew or
> +		 * merged with the next one, then iterate from last_vma_end.
> +		 */
> +		smap_gather_stats(priv, vma, &mss,
> +				  vma->vm_start < last_vma_end ? last_vma_end : 0);
>  		last_vma_end = vma->vm_end;
>  
>  		/*
>  		 * Release mmap_lock temporarily if someone wants to
> -		 * access it for write request.
> +		 * take it for write request.
>  		 */
> -		if (mmap_lock_is_contended(mm)) {
> -			vma_iter_invalidate(&vmi);
> -			unlock_ctx_mm(&priv->lock_ctx);
> -			ret = lock_ctx_mm(&priv->lock_ctx);
> -			if (ret) {
> -				release_task_mempolicy(priv);
> +		if (is_mmap_lock_contended(priv)) {
> +			unlock_vma_range(&priv->lock_ctx);
> +			ret = lock_vma_range(m, lock_ctx);
> +			if (ret)
>  				goto out_put_mm;
> -			}
> -
> -			/*
> -			 * After dropping the lock, there are four cases to
> -			 * consider. See the following example for explanation.
> -			 *
> -			 *   +------+------+-----------+
> -			 *   | VMA1 | VMA2 | VMA3      |
> -			 *   +------+------+-----------+
> -			 *   |      |      |           |
> -			 *  4k     8k     16k         400k
> -			 *
> -			 * Suppose we drop the lock after reading VMA2 due to
> -			 * contention, then we get:
> -			 *
> -			 *	last_vma_end = 16k
> -			 *
> -			 * 1) VMA2 is freed, but VMA3 exists:
> -			 *
> -			 *    vma_next(vmi) will return VMA3.
> -			 *    In this case, just continue from VMA3.
> -			 *
> -			 * 2) VMA2 still exists:
> -			 *
> -			 *    vma_next(vmi) will return VMA3.
> -			 *    In this case, just continue from VMA3.
> -			 *
> -			 * 3) No more VMAs can be found:
> -			 *
> -			 *    vma_next(vmi) will return NULL.
> -			 *    No more things to do, just break.
> -			 *
> -			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> -			 *
> -			 *    vma_next(vmi) will return VMA' whose range
> -			 *    contains last_vma_end.
> -			 *    Iterate VMA' from last_vma_end.
> -			 */
> -			vma = vma_next(&vmi);
> -			/* Case 3 above */
> -			if (!vma)
> -				break;
> -
> -			/* Case 1 and 2 above */
> -			if (vma->vm_start >= last_vma_end) {
> -				smap_gather_stats(priv, vma, &mss, 0);
> -				last_vma_end = vma->vm_end;
> -				continue;
> -			}
>  
> -			/* Case 4 above */
> -			if (vma->vm_end > last_vma_end) {
> -				smap_gather_stats(priv, vma, &mss, last_vma_end);
> -				last_vma_end = vma->vm_end;
> -			}

That's quite the simplification.

> +			/* Resume from the last position. */
> +			pos = last_vma_end;
> +			vma_iter_init(&priv->iter, mm, pos);

I'll leave checking the VMA locking details to VMA locking experts :)

-- 
Cheers,

David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-08 15:52   ` David Hildenbrand (Arm)
@ 2026-06-08 16:20     ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-08 16:20 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: akpm, liam, ljs, vbabka, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Mon, Jun 8, 2026 at 8:52 AM David Hildenbrand (Arm) <david@kernel.org> wrote:
>
> On 6/6/26 03:57, Suren Baghdasaryan wrote:
> > proc/pid/smaps_rollup can be read using the combination of RCU and
> > VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> > required to safely traverse the VMA tree and VMA lock stabilizes the
> > VMA being processed and the pagetable walk.
> > Note that we have to keep the logic to drop mmap_lock on contention
> > because even when using per-VMA locks we might have to fall back to
> > holding the mmap_lock.
> >
> > Running Paul's contention benchmark [1] shows considerable improvement
> > both in median and in the worst case latencies:
> >
> > Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> > --busyduration 2 --procfile smaps_rollup
> >
> > Baseline:
> >
> >     0.174     0.161     2.553
> >     0.174     0.164     2.663
> >     0.174     0.165     2.664
> >     0.174     0.166     2.679
> >     0.174     0.167     2.691
> >     0.174     0.168     2.704
> >     0.174     0.169     2.729
> >     0.174     0.172     2.741
> >     0.174     0.174     2.745
> >     0.174     0.174     2.755
> >     0.174     0.175     2.790
> >     0.174     0.177     2.809
> >     0.174     0.179     3.096
> >     0.174     0.183     3.144
> >     0.174     0.184     3.158
> >     0.174     0.185     3.175
> >     0.174     0.185     4.568
> >     0.174     0.198     4.821
> >     0.174     0.214     5.143
> >     0.174     0.251     5.220
> >
> > Patched:
> >
> >     0.007     0.007     1.952
> >     0.007     0.007     1.955
> >     0.007     0.007     1.955
> >     0.007     0.007     1.955
> >     0.007     0.007     1.957
> >     0.007     0.007     1.969
> >     0.007     0.007     2.065
> >     0.007     0.007     2.075
> >     0.007     0.007     2.146
> >     0.007     0.007     2.195
> >     0.007     0.007     2.223
> >     0.007     0.007     2.259
> >     0.007     0.007     2.488
> >     0.007     0.007     2.562
> >     0.007     0.007     2.599
> >     0.007     0.007     2.697
> >     0.007     0.007     3.030
> >     0.007     0.007     3.075
> >     0.007     0.007     3.145
> >     0.007     0.007     3.225
> >
> > [1] https://github.com/paulmckrcu/proc-mmap_sem-test
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
> >  1 file changed, 59 insertions(+), 75 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 023422fcee12..c2bd9f5bbbcd 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
> >       vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
> >  }
> >
> > +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
>
> is_mmap_lock_contended() vs. mmap_lock_is_contended() ... really nasty.

Agree. Once Dave Hansen's cleanup patchset is posted we will be able
to remove most of this nastiness. Maybe I should wait for it before
posting this patch.


>
> > +{
> > +     struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> > +
> > +     if (!lock_ctx->mmap_locked)
> > +             return false;
> > +
> > +     return !!mmap_lock_is_contended(lock_ctx->mm);
>
> !! is not required anymore, the compiler nowadays knows how to handle booleans
> correctly.

Ack.

>
> > +}
> > +
> >  #else /* CONFIG_PER_VMA_LOCK */
> >
> >  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
> >       return false;
> >  }
> >
> > +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> > +{
> > +     return !!mmap_lock_is_contended(priv->lock_ctx.mm);
> > +}
> > +
> >  static inline void drop_rcu(struct proc_maps_private *priv) {}
> >  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
> >
> > @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
> >  static int show_smaps_rollup(struct seq_file *m, void *v)
> >  {
> >       struct proc_maps_private *priv = m->private;
> > +     struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> > +     struct mm_struct *mm = lock_ctx->mm;
> >       struct mem_size_stats mss = {};
> > -     struct mm_struct *mm = priv->lock_ctx.mm;
> >       struct vm_area_struct *vma;
> > -     unsigned long vma_start = 0, last_vma_end = 0;
> > +     unsigned long vma_start = 0;
> > +     unsigned long last_vma_end = 0;
> > +     loff_t pos = 0;
> >       int ret = 0;
> > -     VMA_ITERATOR(vmi, mm, 0);
> > +
> >
>
> Is there now a double-empty line? (it's getting late here)

Oops. Right. Will fix.

>
> >       priv->task = get_proc_task(priv->inode);
> >       if (!priv->task)
> > @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >               goto out_put_task;
> >       }
> >
> > -     ret = lock_ctx_mm(&priv->lock_ctx);
> > +     hold_task_mempolicy(priv);
> > +     ret = lock_vma_range(m, lock_ctx);
> >       if (ret)
> >               goto out_put_mm;
> >
> > -     hold_task_mempolicy(priv);
> > -     vma = vma_next(&vmi);
> > -
> > -     if (unlikely(!vma))
> > +     vma_iter_init(&priv->iter, mm, 0);
> > +     vma = proc_get_vma(m, &pos);
> > +     if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
> >               goto empty_set;
> >
> > +     if (IS_ERR(vma)) {
> > +             ret = PTR_ERR(vma);
> > +             goto out_unlock;
> > +     }
> > +
> >       vma_start = vma->vm_start;
> > -     do {
> > -             smap_gather_stats(priv, vma, &mss, 0);
> > +     while (vma) {
> > +             if (IS_ERR(vma)) {
> > +                     ret = PTR_ERR(vma);
> > +                     goto out_unlock;
> > +             }
> > +
> > +             if (vma == get_gate_vma(priv->lock_ctx.mm))
> > +                     break;
> > +
>
> Can you refresh my brain why we now have to check for gate VMAs explicitly?

smap_gather_stats() skips the gate VMA anyway and after the gate we
will be terminating the loop. So, once we see the gate VMA we can exit
the loop. I could teach proc_get_vma() to handle the gate VMA but this
early termination seemed simpler.

>
> > +             /*
> > +              * If after retaking mmap_lock, already reported VMA grew or
> > +              * merged with the next one, then iterate from last_vma_end.
> > +              */
> > +             smap_gather_stats(priv, vma, &mss,
> > +                               vma->vm_start < last_vma_end ? last_vma_end : 0);
> >               last_vma_end = vma->vm_end;
> >
> >               /*
> >                * Release mmap_lock temporarily if someone wants to
> > -              * access it for write request.
> > +              * take it for write request.
> >                */
> > -             if (mmap_lock_is_contended(mm)) {
> > -                     vma_iter_invalidate(&vmi);
> > -                     unlock_ctx_mm(&priv->lock_ctx);
> > -                     ret = lock_ctx_mm(&priv->lock_ctx);
> > -                     if (ret) {
> > -                             release_task_mempolicy(priv);
> > +             if (is_mmap_lock_contended(priv)) {
> > +                     unlock_vma_range(&priv->lock_ctx);
> > +                     ret = lock_vma_range(m, lock_ctx);
> > +                     if (ret)
> >                               goto out_put_mm;
> > -                     }
> > -
> > -                     /*
> > -                      * After dropping the lock, there are four cases to
> > -                      * consider. See the following example for explanation.
> > -                      *
> > -                      *   +------+------+-----------+
> > -                      *   | VMA1 | VMA2 | VMA3      |
> > -                      *   +------+------+-----------+
> > -                      *   |      |      |           |
> > -                      *  4k     8k     16k         400k
> > -                      *
> > -                      * Suppose we drop the lock after reading VMA2 due to
> > -                      * contention, then we get:
> > -                      *
> > -                      *      last_vma_end = 16k
> > -                      *
> > -                      * 1) VMA2 is freed, but VMA3 exists:
> > -                      *
> > -                      *    vma_next(vmi) will return VMA3.
> > -                      *    In this case, just continue from VMA3.
> > -                      *
> > -                      * 2) VMA2 still exists:
> > -                      *
> > -                      *    vma_next(vmi) will return VMA3.
> > -                      *    In this case, just continue from VMA3.
> > -                      *
> > -                      * 3) No more VMAs can be found:
> > -                      *
> > -                      *    vma_next(vmi) will return NULL.
> > -                      *    No more things to do, just break.
> > -                      *
> > -                      * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> > -                      *
> > -                      *    vma_next(vmi) will return VMA' whose range
> > -                      *    contains last_vma_end.
> > -                      *    Iterate VMA' from last_vma_end.
> > -                      */
> > -                     vma = vma_next(&vmi);
> > -                     /* Case 3 above */
> > -                     if (!vma)
> > -                             break;
> > -
> > -                     /* Case 1 and 2 above */
> > -                     if (vma->vm_start >= last_vma_end) {
> > -                             smap_gather_stats(priv, vma, &mss, 0);
> > -                             last_vma_end = vma->vm_end;
> > -                             continue;
> > -                     }
> >
> > -                     /* Case 4 above */
> > -                     if (vma->vm_end > last_vma_end) {
> > -                             smap_gather_stats(priv, vma, &mss, last_vma_end);
> > -                             last_vma_end = vma->vm_end;
> > -                     }
>
> That's quite the simplification.

True, that's why I decided to remove this comment. I handle this Case
4 in the new code and Cases 1-3 are handled naturally, nothing special
about them.

>
> > +                     /* Resume from the last position. */
> > +                     pos = last_vma_end;
> > +                     vma_iter_init(&priv->iter, mm, pos);
>
> I'll leave checking the VMA locking details to VMA locking experts :)

This is the same pattern we now use for reading maps/smaps/numa_maps.
lock_vma_range() takes rcu_read_lock and proc_get_vma() locks the VMA
under that RCU.

Thanks for the feedback!

>
> --
> Cheers,
>
> David


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code
  2026-06-06  1:57 [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code Suren Baghdasaryan
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
  2026-06-08 15:38 ` [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code David Hildenbrand (Arm)
@ 2026-06-09  8:27 ` Lorenzo Stoakes
  2026-06-09 17:13   ` Suren Baghdasaryan
  2 siblings, 1 reply; 17+ messages in thread
From: Lorenzo Stoakes @ 2026-06-09  8:27 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Fri, Jun 05, 2026 at 06:57:28PM -0700, Suren Baghdasaryan wrote:
> To correctly propagate error code from lock_vma_range(), change it to
> return the error code instead of the boolean. This simplifies error
> propagation code.
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Logic LGTM, couple nits below, but:

Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

> ---
>  fs/proc/task_mmu.c | 16 +++++++++-------
>  1 file changed, 9 insertions(+), 7 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index d32408f7cd5e..023422fcee12 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -162,13 +162,13 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
>  	}
>  }
>
> -static inline bool lock_vma_range(struct seq_file *m,
> -				  struct proc_maps_locking_ctx *lock_ctx)
> +static inline int lock_vma_range(struct seq_file *m,

Could drop the unnecessary inline here while we're here.

> +				 struct proc_maps_locking_ctx *lock_ctx)
>  {
>  	rcu_read_lock();
>  	reset_lock_ctx(lock_ctx);
>
> -	return true;
> +	return 0;
>  }
>
>  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> @@ -245,10 +245,10 @@ static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
>  	mmap_read_unlock(lock_ctx->mm);
>  }
>
> -static inline bool lock_vma_range(struct seq_file *m,
> +static inline int lock_vma_range(struct seq_file *m,

Same comment as above.

>  				  struct proc_maps_locking_ctx *lock_ctx)
>  {
> -	return lock_ctx_mm(lock_ctx) == 0;
> +	return lock_ctx_mm(lock_ctx);
>  }
>
>  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> @@ -311,6 +311,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  	struct proc_maps_locking_ctx *lock_ctx;
>  	loff_t last_addr = *ppos;
>  	struct mm_struct *mm;
> +	int err;
>
>  	/* See m_next(). Zero at the start or after lseek. */
>  	if (last_addr == SENTINEL_VMA_END)
> @@ -328,11 +329,12 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  		return NULL;
>  	}
>
> -	if (!lock_vma_range(m, lock_ctx)) {
> +	err = lock_vma_range(m, lock_ctx);
> +	if (err) {
>  		mmput(mm);
>  		put_task_struct(priv->task);
>  		priv->task = NULL;
> -		return ERR_PTR(-EINTR);
> +		return ERR_PTR(err);
>  	}
>
>  	/*
>
> base-commit: e178a530a81621a29efbca49b3b78202a18236e4
> --
> 2.54.0.1032.g2f8565e1d1-goog
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
                     ` (2 preceding siblings ...)
  2026-06-08 15:52   ` David Hildenbrand (Arm)
@ 2026-06-09 10:00   ` Lorenzo Stoakes
  2026-06-09 17:11     ` Suren Baghdasaryan
  3 siblings, 1 reply; 17+ messages in thread
From: Lorenzo Stoakes @ 2026-06-09 10:00 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Fri, Jun 05, 2026 at 06:57:29PM -0700, Suren Baghdasaryan wrote:
> proc/pid/smaps_rollup can be read using the combination of RCU and
> VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> required to safely traverse the VMA tree and VMA lock stabilizes the
> VMA being processed and the pagetable walk.
> Note that we have to keep the logic to drop mmap_lock on contention
> because even when using per-VMA locks we might have to fall back to
> holding the mmap_lock.
>
> Running Paul's contention benchmark [1] shows considerable improvement
> both in median and in the worst case latencies:
>
> Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> --busyduration 2 --procfile smaps_rollup
>
> Baseline:
>
>     0.174     0.161     2.553
>     0.174     0.164     2.663
>     0.174     0.165     2.664
>     0.174     0.166     2.679
>     0.174     0.167     2.691
>     0.174     0.168     2.704
>     0.174     0.169     2.729
>     0.174     0.172     2.741
>     0.174     0.174     2.745
>     0.174     0.174     2.755
>     0.174     0.175     2.790
>     0.174     0.177     2.809
>     0.174     0.179     3.096
>     0.174     0.183     3.144
>     0.174     0.184     3.158
>     0.174     0.185     3.175
>     0.174     0.185     4.568
>     0.174     0.198     4.821
>     0.174     0.214     5.143
>     0.174     0.251     5.220
>
> Patched:
>
>     0.007     0.007     1.952
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.955
>     0.007     0.007     1.957
>     0.007     0.007     1.969
>     0.007     0.007     2.065
>     0.007     0.007     2.075
>     0.007     0.007     2.146
>     0.007     0.007     2.195
>     0.007     0.007     2.223
>     0.007     0.007     2.259
>     0.007     0.007     2.488
>     0.007     0.007     2.562
>     0.007     0.007     2.599
>     0.007     0.007     2.697
>     0.007     0.007     3.030
>     0.007     0.007     3.075
>     0.007     0.007     3.145
>     0.007     0.007     3.225

Ohhh lovely!

>
> [1] https://github.com/paulmckrcu/proc-mmap_sem-test
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>

Having looked through it carefully the logic looks correct to me, but I
think we can improve how this is implemented, so attach a patch with my
suggestions folded up to make life easier!

> ---
>  fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
>  1 file changed, 59 insertions(+), 75 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 023422fcee12..c2bd9f5bbbcd 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
>  	vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
>  }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)

Hmm seems David and I are saying the same things :) but yeah this name being
very close to mmap_lock_is_contended() is suboptimal.

Also the inline is pointless here and suppresses the compiler unused check which
isn't helpful.

But in general, given we already have the lock context I think we can do away
with this altogether, see below.


> +{
> +	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +
> +	if (!lock_ctx->mmap_locked)
> +		return false;
> +
> +	return !!mmap_lock_is_contended(lock_ctx->mm);

As David said, !! is not a necessary incantation any more :)

> +}
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>
>  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
>  	return false;
>  }
>
> +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> +{
> +	return !!mmap_lock_is_contended(priv->lock_ctx.mm);

Similarly. But also as above I don't think we need this.

> +}
> +
>  static inline void drop_rcu(struct proc_maps_private *priv) {}
>  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
>
> @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
>  static int show_smaps_rollup(struct seq_file *m, void *v)
>  {
>  	struct proc_maps_private *priv = m->private;
> +	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> +	struct mm_struct *mm = lock_ctx->mm;
>  	struct mem_size_stats mss = {};
> -	struct mm_struct *mm = priv->lock_ctx.mm;
>  	struct vm_area_struct *vma;
> -	unsigned long vma_start = 0, last_vma_end = 0;
> +	unsigned long vma_start = 0;
> +	unsigned long last_vma_end = 0;

I mean I kinda prefer these separate but obviously not necessary :P

> +	loff_t pos = 0;
>  	int ret = 0;
> -	VMA_ITERATOR(vmi, mm, 0);
> +

Yeah extra line as David said :)

>
>  	priv->task = get_proc_task(priv->inode);
>  	if (!priv->task)
> @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>  		goto out_put_task;
>  	}
>
> -	ret = lock_ctx_mm(&priv->lock_ctx);
> +	hold_task_mempolicy(priv);
> +	ret = lock_vma_range(m, lock_ctx);
>  	if (ret)
>  		goto out_put_mm;
>
> -	hold_task_mempolicy(priv);
> -	vma = vma_next(&vmi);
> -
> -	if (unlikely(!vma))
> +	vma_iter_init(&priv->iter, mm, 0);
> +	vma = proc_get_vma(m, &pos);
> +	if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
>  		goto empty_set;

Is the gate VMA check here necessary? We already check it in the loop.

>
> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto out_unlock;
> +	}

Again this is duplicated.

> +
>  	vma_start = vma->vm_start;

Could just drop the gate check, and replace this with:

	vma_start = IS_ERR(vma) ? 0 : vma->vm_start;

> -	do {
> -		smap_gather_stats(priv, vma, &mss, 0);
> +	while (vma) {
> +		if (IS_ERR(vma)) {
> +			ret = PTR_ERR(vma);
> +			goto out_unlock;
> +		}
> +
> +		if (vma == get_gate_vma(priv->lock_ctx.mm))
> +			break;
> +
> +		/*
> +		 * If after retaking mmap_lock, already reported VMA grew or
> +		 * merged with the next one, then iterate from last_vma_end.
> +		 */

Hmm, but if the already reported VMA grew, vma->vm_start would not be <
last_vma_end would it?

So surely this is only covering the merge case?

> +		smap_gather_stats(priv, vma, &mss,
> +				  vma->vm_start < last_vma_end ? last_vma_end : 0);

I don't love how compressed this is.

I also don't love that 0 is taken to be 'start from vma->vm_start' and I
also don't love that the code in smap_gather_stats() actually special cases
this...

How about passing last_vma_end and making smap_gather_stats() more sane? In
the other invocation of smap_gather_stats() we could pass vma->vm_start
here.

We can then move the comment into smap_gather_stats(). E.g.:

		/* Handles the case of VMA merged since mmap locked drop too. */
		smap_gather_stats(priv, vma, &mss, last_vma_end);

To make life easier I put all of my suggestions into a patch which I've
attached :) (though it is untested) let me know what you think.


>  		last_vma_end = vma->vm_end;
>
>  		/*
>  		 * Release mmap_lock temporarily if someone wants to
> -		 * access it for write request.
> +		 * take it for write request.

I think this is better rewritten to reflect the changes. E.g.:

		/*
		 * If the VMA lock is not taken, we hold the often contended
                 * mmap lock. This can be because the arch doesn't support VMA
                 * locks,or we had to fall back to the mmap lock.
		 *
		 * To relieve pressure, check if it is indeed contended, then
                 * temporarily release it.
		 */

>  		 */
> -		if (mmap_lock_is_contended(mm)) {
> -			vma_iter_invalidate(&vmi);
> -			unlock_ctx_mm(&priv->lock_ctx);
> -			ret = lock_ctx_mm(&priv->lock_ctx);
> -			if (ret) {
> -				release_task_mempolicy(priv);
> +		if (is_mmap_lock_contended(priv)) {

We have both mm and lock_ctx already.

So we could instead do:

		if (lock_ctx->mmap_locked && mmap_lock_is_contended(mm)) {
			...

Right?

> +			unlock_vma_range(&priv->lock_ctx);

OK this confuses me - we're gated on lock_ctx->mmap_locked, so this is exactly
the same as unlock_ctx_mm(lock_ctx) right?

There's no situation here where the VMA lock would be unlocked.

Surely it'd therefore be clearer to just call unlock_ctx_mm(lock_ctx)
direct?

> +			ret = lock_vma_range(m, lock_ctx);
> +			if (ret)
>  				goto out_put_mm;


In the case of VMA locks being supported, but we fell back to mmap locks, this
seems to just be getting us back to the invariant that the RCU read lock is
held.

However, it's a bit confusing given we're explicitly checking for the
mmap_locked case, so I think it's worth a comment explaining that we might have
fallen back to mmap lock, but could still get the VMA lock on the next VMA.

E.g.:

	/*
	 * If we are using VMA locks but fell back to an mmap lock, we may
         * be able to VMA lock the next VMA, so reset the lock and try again.
	 *
	 * Otherwise, if the arch doesn't support VMA locks, this simply
         * retakes the mmap lock.
	 */


> -			}
> -
> -			/*
> -			 * After dropping the lock, there are four cases to
> -			 * consider. See the following example for explanation.
> -			 *
> -			 *   +------+------+-----------+
> -			 *   | VMA1 | VMA2 | VMA3      |
> -			 *   +------+------+-----------+
> -			 *   |      |      |           |
> -			 *  4k     8k     16k         400k
> -			 *
> -			 * Suppose we drop the lock after reading VMA2 due to
> -			 * contention, then we get:
> -			 *
> -			 *	last_vma_end = 16k
> -			 *
> -			 * 1) VMA2 is freed, but VMA3 exists:
> -			 *
> -			 *    vma_next(vmi) will return VMA3.
> -			 *    In this case, just continue from VMA3.
> -			 *
> -			 * 2) VMA2 still exists:
> -			 *
> -			 *    vma_next(vmi) will return VMA3.
> -			 *    In this case, just continue from VMA3.
> -			 *
> -			 * 3) No more VMAs can be found:
> -			 *
> -			 *    vma_next(vmi) will return NULL.
> -			 *    No more things to do, just break.
> -			 *
> -			 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> -			 *
> -			 *    vma_next(vmi) will return VMA' whose range
> -			 *    contains last_vma_end.
> -			 *    Iterate VMA' from last_vma_end.
> -			 */
> -			vma = vma_next(&vmi);
> -			/* Case 3 above */
> -			if (!vma)
> -				break;
> -
> -			/* Case 1 and 2 above */
> -			if (vma->vm_start >= last_vma_end) {
> -				smap_gather_stats(priv, vma, &mss, 0);
> -				last_vma_end = vma->vm_end;
> -				continue;
> -			}
>
> -			/* Case 4 above */
> -			if (vma->vm_end > last_vma_end) {
> -				smap_gather_stats(priv, vma, &mss, last_vma_end);
> -				last_vma_end = vma->vm_end;
> -			}

OK so you're rolling all this up into the 'if after retaking mmap_lock'
bit.

I don't love that we're losing this information, but also it's maybe a
little more than we need here and it's making

But perhaps could we transfer this into the commit message as a
later-findable justification?

> +			/* Resume from the last position. */
> +			pos = last_vma_end;
> +			vma_iter_init(&priv->iter, mm, pos);
>  		}
> -	} for_each_vma(vmi, vma);
> +		vma = proc_get_vma(m, &pos);
> +	}
>
>  empty_set:
>  	show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
> @@ -1593,10 +1577,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>
>  	__show_smap(m, &mss, true);
>
> -	release_task_mempolicy(priv);
> -	unlock_ctx_mm(&priv->lock_ctx);
> -
> +out_unlock:
> +	unlock_vma_range(&priv->lock_ctx);
>  out_put_mm:
> +	release_task_mempolicy(priv);

Previously this was done under the mmap lock, now it's not. I don't think
this should be an issue but just highlighting.

>  	mmput(mm);
>  out_put_task:
>  	put_task_struct(priv->task);
> --
> 2.54.0.1032.g2f8565e1d1-goog
>

Cheers, Lorenzo

----8<----
From d94e7a192242f2cefb10bbfa12495e46c3d4c973 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <ljs@kernel.org>
Date: Tue, 9 Jun 2026 10:39:39 +0100
Subject: [PATCH] ideas

---
 fs/proc/task_mmu.c | 89 ++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 47 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c2bd9f5bbbcd..16bf3cd8c7c7 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -233,16 +233,6 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
 	vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
 }

-static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
-{
-	struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
-
-	if (!lock_ctx->mmap_locked)
-		return false;
-
-	return !!mmap_lock_is_contended(lock_ctx->mm);
-}
-
 #else /* CONFIG_PER_VMA_LOCK */

 static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
@@ -278,11 +268,6 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
 	return false;
 }

-static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
-{
-	return !!mmap_lock_is_contended(priv->lock_ctx.mm);
-}
-
 static inline void drop_rcu(struct proc_maps_private *priv) {}
 static inline void reacquire_rcu(struct proc_maps_private *priv) {}

@@ -1375,17 +1360,24 @@ get_smaps_shmem_walk_ops(struct proc_maps_private *priv)

 #endif /* CONFIG_PER_VMA_LOCK */

-/*
- * Gather mem stats from @vma with the indicated beginning
- * address @start, and keep them in @mss.
+/**
+ * smap_gather_stats() - Gather mem stats from @vma.
+ * @priv: proc maps private state.
+ * @vma: The VMA whoms stats we wish to gather.
+ * @mss: The accumulated stats.
+ * @start: The address from which to start.
  *
- * Use vm_start of @vma as the beginning address if @start is 0.
+ * This gathers stats for the whole of the VMA unless the mmap lock was dropped
+ * and we raced a VMA merge, in which case we only gather stats for the
+ * remainder of the merged range.
  */
 static void smap_gather_stats(struct proc_maps_private *priv,
 			      struct vm_area_struct *vma,
-			      struct mem_size_stats *mss, unsigned long start)
+			      struct mem_size_stats *mss,
+			      unsigned long start)
 {
 	const struct mm_walk_ops *ops = get_smaps_walk_ops(priv);
+	const bool is_partial = start > vma->vm_start;

 	/* Invalid start */
 	if (start >= vma->vm_end)
@@ -1408,20 +1400,20 @@ static void smap_gather_stats(struct proc_maps_private *priv,
 		 * Unless we know that the shmem object (or the part mapped by
 		 * our VMA) has no swapped out pages at all.
 		 */
-		unsigned long shmem_swapped = shmem_swap_usage(vma);
+		const unsigned long shmem_swapped = shmem_swap_usage(vma);
+		const bool shared_or_ro = vma_test(vma, VMA_SHARED_BIT) ||
+			!vma_test(vma, VMA_WRITE_BIT);

-		if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
-					!(vma->vm_flags & VM_WRITE))) {
+		if (!is_partial && (!shmem_swapped || shared_or_ro))
 			mss->swap += shmem_swapped;
-		} else {
+		else
 			ops = get_smaps_shmem_walk_ops(priv);
-		}
 	}

-	if (!start)
-		walk_page_vma(vma, ops, mss);
-	else
+	if (is_partial)
 		walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
+	else
+		walk_page_vma(vma, ops, mss);

 	reacquire_rcu(priv);
 }
@@ -1476,7 +1468,7 @@ static int show_smap(struct seq_file *m, void *v)
 	struct vm_area_struct *vma = v;
 	struct mem_size_stats mss = {};

-	smap_gather_stats(priv, vma, &mss, 0);
+	smap_gather_stats(priv, vma, &mss, vma->vm_start);

 	show_map_vma(m, vma);

@@ -1510,7 +1502,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 	loff_t pos = 0;
 	int ret = 0;

-
 	priv->task = get_proc_task(priv->inode);
 	if (!priv->task)
 		return -ESRCH;
@@ -1527,15 +1518,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)

 	vma_iter_init(&priv->iter, mm, 0);
 	vma = proc_get_vma(m, &pos);
-	if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
+	if (unlikely(!vma))
 		goto empty_set;

-	if (IS_ERR(vma)) {
-		ret = PTR_ERR(vma);
-		goto out_unlock;
-	}
-
-	vma_start = vma->vm_start;
+	vma_start = IS_ERR(vma) ? 0 : vma->vm_start;
 	while (vma) {
 		if (IS_ERR(vma)) {
 			ret = PTR_ERR(vma);
@@ -1545,20 +1531,29 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 		if (vma == get_gate_vma(priv->lock_ctx.mm))
 			break;

-		/*
-		 * If after retaking mmap_lock, already reported VMA grew or
-		 * merged with the next one, then iterate from last_vma_end.
-		 */
-		smap_gather_stats(priv, vma, &mss,
-				  vma->vm_start < last_vma_end ? last_vma_end : 0);
+		/* Handles the case of VMA merged since mmap locked drop too. */
+		smap_gather_stats(priv, vma, &mss, last_vma_end);
 		last_vma_end = vma->vm_end;

 		/*
-		 * Release mmap_lock temporarily if someone wants to
-		 * take it for write request.
+		 * If the VMA lock is not taken, we hold the often contended
+                 * mmap lock. This can be because the arch doesn't support VMA
+                 * locks,or we had to fall back to the mmap lock.
+		 *
+		 * To relieve pressure, check if it is indeed contended, then
+                 * temporarily release it.
 		 */
-		if (is_mmap_lock_contended(priv)) {
-			unlock_vma_range(&priv->lock_ctx);
+		if (lock_ctx->mmap_locked && mmap_lock_is_contended(mm)) {
+			unlock_ctx_mm(lock_ctx);
+
+			/*
+			 * If we are using VMA locks but fell back to an mmap
+			 * lock, we may be able to VMA lock the next VMA, so
+			 * reset the lock and try again.
+			 *
+			 * Otherwise, if the arch doesn't support VMA locks,
+			 * this simply retakes the mmap lock.
+			 */
 			ret = lock_vma_range(m, lock_ctx);
 			if (ret)
 				goto out_put_mm;
--
2.54.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock
  2026-06-09 10:00   ` Lorenzo Stoakes
@ 2026-06-09 17:11     ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-09 17:11 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel, Dave Hansen

On Tue, Jun 9, 2026 at 3:00 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
>
> On Fri, Jun 05, 2026 at 06:57:29PM -0700, Suren Baghdasaryan wrote:
> > proc/pid/smaps_rollup can be read using the combination of RCU and
> > VMA read locks, similar to proc/pid/{maps|smaps|numa_maps}. RCU is
> > required to safely traverse the VMA tree and VMA lock stabilizes the
> > VMA being processed and the pagetable walk.
> > Note that we have to keep the logic to drop mmap_lock on contention
> > because even when using per-VMA locks we might have to fall back to
> > holding the mmap_lock.
> >
> > Running Paul's contention benchmark [1] shows considerable improvement
> > both in median and in the worst case latencies:
> >
> > Execution command: run-proc-vs-map.sh --nsamples 20 --rawdata -- \
> > --busyduration 2 --procfile smaps_rollup
> >
> > Baseline:
> >
> >     0.174     0.161     2.553
> >     0.174     0.164     2.663
> >     0.174     0.165     2.664
> >     0.174     0.166     2.679
> >     0.174     0.167     2.691
> >     0.174     0.168     2.704
> >     0.174     0.169     2.729
> >     0.174     0.172     2.741
> >     0.174     0.174     2.745
> >     0.174     0.174     2.755
> >     0.174     0.175     2.790
> >     0.174     0.177     2.809
> >     0.174     0.179     3.096
> >     0.174     0.183     3.144
> >     0.174     0.184     3.158
> >     0.174     0.185     3.175
> >     0.174     0.185     4.568
> >     0.174     0.198     4.821
> >     0.174     0.214     5.143
> >     0.174     0.251     5.220
> >
> > Patched:
> >
> >     0.007     0.007     1.952
> >     0.007     0.007     1.955
> >     0.007     0.007     1.955
> >     0.007     0.007     1.955
> >     0.007     0.007     1.957
> >     0.007     0.007     1.969
> >     0.007     0.007     2.065
> >     0.007     0.007     2.075
> >     0.007     0.007     2.146
> >     0.007     0.007     2.195
> >     0.007     0.007     2.223
> >     0.007     0.007     2.259
> >     0.007     0.007     2.488
> >     0.007     0.007     2.562
> >     0.007     0.007     2.599
> >     0.007     0.007     2.697
> >     0.007     0.007     3.030
> >     0.007     0.007     3.075
> >     0.007     0.007     3.145
> >     0.007     0.007     3.225
>
> Ohhh lovely!
>
> >
> > [1] https://github.com/paulmckrcu/proc-mmap_sem-test
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Having looked through it carefully the logic looks correct to me, but I
> think we can improve how this is implemented, so attach a patch with my
> suggestions folded up to make life easier!
>
> > ---
> >  fs/proc/task_mmu.c | 134 ++++++++++++++++++++-------------------------
> >  1 file changed, 59 insertions(+), 75 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 023422fcee12..c2bd9f5bbbcd 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -233,6 +233,16 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
> >       vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
> >  }
> >
> > +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
>
> Hmm seems David and I are saying the same things :) but yeah this name being
> very close to mmap_lock_is_contended() is suboptimal.
>
> Also the inline is pointless here and suppresses the compiler unused check which
> isn't helpful.

Ack.

>
> But in general, given we already have the lock context I think we can do away
> with this altogether, see below.
>
>
> > +{
> > +     struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> > +
> > +     if (!lock_ctx->mmap_locked)
> > +             return false;
> > +
> > +     return !!mmap_lock_is_contended(lock_ctx->mm);
>
> As David said, !! is not a necessary incantation any more :)

Ack.

>
> > +}
> > +
> >  #else /* CONFIG_PER_VMA_LOCK */
> >
> >  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -268,6 +278,11 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
> >       return false;
> >  }
> >
> > +static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> > +{
> > +     return !!mmap_lock_is_contended(priv->lock_ctx.mm);
>
> Similarly. But also as above I don't think we need this.

Sadly, for now we do (see my later comment).

>
> > +}
> > +
> >  static inline void drop_rcu(struct proc_maps_private *priv) {}
> >  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
> >
> > @@ -1486,12 +1501,15 @@ static int show_smap(struct seq_file *m, void *v)
> >  static int show_smaps_rollup(struct seq_file *m, void *v)
> >  {
> >       struct proc_maps_private *priv = m->private;
> > +     struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> > +     struct mm_struct *mm = lock_ctx->mm;
> >       struct mem_size_stats mss = {};
> > -     struct mm_struct *mm = priv->lock_ctx.mm;
> >       struct vm_area_struct *vma;
> > -     unsigned long vma_start = 0, last_vma_end = 0;
> > +     unsigned long vma_start = 0;
> > +     unsigned long last_vma_end = 0;
>
> I mean I kinda prefer these separate but obviously not necessary :P

It looked nicer I think :)

>
> > +     loff_t pos = 0;
> >       int ret = 0;
> > -     VMA_ITERATOR(vmi, mm, 0);
> > +
>
> Yeah extra line as David said :)

Ack.

>
> >
> >       priv->task = get_proc_task(priv->inode);
> >       if (!priv->task)
> > @@ -1502,89 +1520,55 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >               goto out_put_task;
> >       }
> >
> > -     ret = lock_ctx_mm(&priv->lock_ctx);
> > +     hold_task_mempolicy(priv);
> > +     ret = lock_vma_range(m, lock_ctx);
> >       if (ret)
> >               goto out_put_mm;
> >
> > -     hold_task_mempolicy(priv);
> > -     vma = vma_next(&vmi);
> > -
> > -     if (unlikely(!vma))
> > +     vma_iter_init(&priv->iter, mm, 0);
> > +     vma = proc_get_vma(m, &pos);
> > +     if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
> >               goto empty_set;
>
> Is the gate VMA check here necessary? We already check it in the loop.

We need it to avoid resetting vma_start, which is being reported even
when no normal VMAs are present.

>
> >
> > +     if (IS_ERR(vma)) {
> > +             ret = PTR_ERR(vma);
> > +             goto out_unlock;
> > +     }
>
> Again this is duplicated.
>
> > +
> >       vma_start = vma->vm_start;
>
> Could just drop the gate check, and replace this with:
>
>         vma_start = IS_ERR(vma) ? 0 : vma->vm_start;

Yeah, that would work. I don't have a strong preference but avoiding
the loop on error seemed cleaner to me. Will change.

>
> > -     do {
> > -             smap_gather_stats(priv, vma, &mss, 0);
> > +     while (vma) {
> > +             if (IS_ERR(vma)) {
> > +                     ret = PTR_ERR(vma);
> > +                     goto out_unlock;
> > +             }
> > +
> > +             if (vma == get_gate_vma(priv->lock_ctx.mm))
> > +                     break;
> > +
> > +             /*
> > +              * If after retaking mmap_lock, already reported VMA grew or
> > +              * merged with the next one, then iterate from last_vma_end.
> > +              */
>
> Hmm, but if the already reported VMA grew, vma->vm_start would not be <
> last_vma_end would it?

The scenario is: we reported the VMA before it grew, we set
last_vma_end to its vm_end then dropped the lock. Later after VMA grew
we searched past the last_vma_end and found the same VMA because now
it spans past last_vma_end. In this case vma->vm_start will be less
than last_vma_end and I think this would happen either if the VMA grew
or got merged with its upper neighbor.

>
> So surely this is only covering the merge case?
>
> > +             smap_gather_stats(priv, vma, &mss,
> > +                               vma->vm_start < last_vma_end ? last_vma_end : 0);
>
> I don't love how compressed this is.
>
> I also don't love that 0 is taken to be 'start from vma->vm_start' and I
> also don't love that the code in smap_gather_stats() actually special cases
> this...
>
> How about passing last_vma_end and making smap_gather_stats() more sane? In
> the other invocation of smap_gather_stats() we could pass vma->vm_start
> here.
>
> We can then move the comment into smap_gather_stats(). E.g.:
>
>                 /* Handles the case of VMA merged since mmap locked drop too. */
>                 smap_gather_stats(priv, vma, &mss, last_vma_end);
>
> To make life easier I put all of my suggestions into a patch which I've
> attached :) (though it is untested) let me know what you think.

I was trying to minimize changes to the current logic but that would
be cleaner. I think it would be better to make this logical change as
a separate prerequisite patch. If you agree then I'll do that in my
next version.

>
>
> >               last_vma_end = vma->vm_end;
> >
> >               /*
> >                * Release mmap_lock temporarily if someone wants to
> > -              * access it for write request.
> > +              * take it for write request.
>
> I think this is better rewritten to reflect the changes. E.g.:
>
>                 /*
>                  * If the VMA lock is not taken, we hold the often contended
>                  * mmap lock. This can be because the arch doesn't support VMA
>                  * locks,or we had to fall back to the mmap lock.

I intend to postpone this patchset until after Dave's patchset lands
into mm-unstable, so this "arch doesn't support VMA locks" part will
not be there, but othereise SGTM.

>                  *
>                  * To relieve pressure, check if it is indeed contended, then
>                  * temporarily release it.
>                  */
>
> >                */
> > -             if (mmap_lock_is_contended(mm)) {
> > -                     vma_iter_invalidate(&vmi);
> > -                     unlock_ctx_mm(&priv->lock_ctx);
> > -                     ret = lock_ctx_mm(&priv->lock_ctx);
> > -                     if (ret) {
> > -                             release_task_mempolicy(priv);
> > +             if (is_mmap_lock_contended(priv)) {
>
> We have both mm and lock_ctx already.
>
> So we could instead do:
>
>                 if (lock_ctx->mmap_locked && mmap_lock_is_contended(mm)) {
>                         ...
>
> Right?

Unfortunately not. If you look at the struct proc_maps_locking_ctx
definition, mmap_locked will not be there if CONFIG_PER_VMA_LOCK=n.
That's why I have to introduce versions of is_mmap_lock_contended()
for both CONFIG_PER_VMA_LOCK and !CONFIG_PER_VMA_LOCK configs. And
that's why I'm leaning towards waiting for Dave's cleanup to be posted
which will eliminate CONFIG_PER_VMA_LOCK. It will clean up many other
similar helpers in this file.

I just realized I did not CC Dave to this series before, so adding him now.

>
> > +                     unlock_vma_range(&priv->lock_ctx);
>
> OK this confuses me - we're gated on lock_ctx->mmap_locked, so this is exactly
> the same as unlock_ctx_mm(lock_ctx) right?
>
> There's no situation here where the VMA lock would be unlocked.
>
> Surely it'd therefore be clearer to just call unlock_ctx_mm(lock_ctx)
> direct?

Technically yes but one thing I forgot to do here is to remove the
lock_ctx_mm()/unlock_ctx_mm() helpers because after this change there
are no direct users for them.

>
> > +                     ret = lock_vma_range(m, lock_ctx);
> > +                     if (ret)
> >                               goto out_put_mm;
>
>
> In the case of VMA locks being supported, but we fell back to mmap locks, this
> seems to just be getting us back to the invariant that the RCU read lock is
> held.
>
> However, it's a bit confusing given we're explicitly checking for the
> mmap_locked case, so I think it's worth a comment explaining that we might have
> fallen back to mmap lock, but could still get the VMA lock on the next VMA.
>
> E.g.:
>
>         /*
>          * If we are using VMA locks but fell back to an mmap lock, we may
>          * be able to VMA lock the next VMA, so reset the lock and try again.
>          *
>          * Otherwise, if the arch doesn't support VMA locks, this simply
>          * retakes the mmap lock.
>          */

Yeah, I can see now that this case is quite confusing and needs more
comments. Will do in the next rev.

>
>
> > -                     }
> > -
> > -                     /*
> > -                      * After dropping the lock, there are four cases to
> > -                      * consider. See the following example for explanation.
> > -                      *
> > -                      *   +------+------+-----------+
> > -                      *   | VMA1 | VMA2 | VMA3      |
> > -                      *   +------+------+-----------+
> > -                      *   |      |      |           |
> > -                      *  4k     8k     16k         400k
> > -                      *
> > -                      * Suppose we drop the lock after reading VMA2 due to
> > -                      * contention, then we get:
> > -                      *
> > -                      *      last_vma_end = 16k
> > -                      *
> > -                      * 1) VMA2 is freed, but VMA3 exists:
> > -                      *
> > -                      *    vma_next(vmi) will return VMA3.
> > -                      *    In this case, just continue from VMA3.
> > -                      *
> > -                      * 2) VMA2 still exists:
> > -                      *
> > -                      *    vma_next(vmi) will return VMA3.
> > -                      *    In this case, just continue from VMA3.
> > -                      *
> > -                      * 3) No more VMAs can be found:
> > -                      *
> > -                      *    vma_next(vmi) will return NULL.
> > -                      *    No more things to do, just break.
> > -                      *
> > -                      * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
> > -                      *
> > -                      *    vma_next(vmi) will return VMA' whose range
> > -                      *    contains last_vma_end.
> > -                      *    Iterate VMA' from last_vma_end.
> > -                      */
> > -                     vma = vma_next(&vmi);
> > -                     /* Case 3 above */
> > -                     if (!vma)
> > -                             break;
> > -
> > -                     /* Case 1 and 2 above */
> > -                     if (vma->vm_start >= last_vma_end) {
> > -                             smap_gather_stats(priv, vma, &mss, 0);
> > -                             last_vma_end = vma->vm_end;
> > -                             continue;
> > -                     }
> >
> > -                     /* Case 4 above */
> > -                     if (vma->vm_end > last_vma_end) {
> > -                             smap_gather_stats(priv, vma, &mss, last_vma_end);
> > -                             last_vma_end = vma->vm_end;
> > -                     }
>
> OK so you're rolling all this up into the 'if after retaking mmap_lock'
> bit.

Yep, because Cases 1-3 and quite natural. Case 4 is the only "special"
case here IMO, so we handle it separately with separate comments
added.

>
> I don't love that we're losing this information, but also it's maybe a
> little more than we need here and it's making
>
> But perhaps could we transfer this into the commit message as a
> later-findable justification?

Sure, that I can do.

>
> > +                     /* Resume from the last position. */
> > +                     pos = last_vma_end;
> > +                     vma_iter_init(&priv->iter, mm, pos);
> >               }
> > -     } for_each_vma(vmi, vma);
> > +             vma = proc_get_vma(m, &pos);
> > +     }
> >
> >  empty_set:
> >       show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0);
> > @@ -1593,10 +1577,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
> >
> >       __show_smap(m, &mss, true);
> >
> > -     release_task_mempolicy(priv);
> > -     unlock_ctx_mm(&priv->lock_ctx);
> > -
> > +out_unlock:
> > +     unlock_vma_range(&priv->lock_ctx);
> >  out_put_mm:
> > +     release_task_mempolicy(priv);
>
> Previously this was done under the mmap lock, now it's not. I don't think
> this should be an issue but just highlighting.

Yeah, I looked into that but found no evidence that it's a problem. If
anyone knows otherwise please let me know.

Thanks for the review, Lorenzo! Much appreciated.

>
> >       mmput(mm);
> >  out_put_task:
> >       put_task_struct(priv->task);
> > --
> > 2.54.0.1032.g2f8565e1d1-goog
> >
>
> Cheers, Lorenzo
>
> ----8<----
> From d94e7a192242f2cefb10bbfa12495e46c3d4c973 Mon Sep 17 00:00:00 2001
> From: Lorenzo Stoakes <ljs@kernel.org>
> Date: Tue, 9 Jun 2026 10:39:39 +0100
> Subject: [PATCH] ideas
>
> ---
>  fs/proc/task_mmu.c | 89 ++++++++++++++++++++++------------------------
>  1 file changed, 42 insertions(+), 47 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index c2bd9f5bbbcd..16bf3cd8c7c7 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -233,16 +233,6 @@ static inline void reacquire_rcu(struct proc_maps_private *priv)
>         vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end);
>  }
>
> -static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> -{
> -       struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx;
> -
> -       if (!lock_ctx->mmap_locked)
> -               return false;
> -
> -       return !!mmap_lock_is_contended(lock_ctx->mm);
> -}
> -
>  #else /* CONFIG_PER_VMA_LOCK */
>
>  static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> @@ -278,11 +268,6 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
>         return false;
>  }
>
> -static inline bool is_mmap_lock_contended(struct proc_maps_private *priv)
> -{
> -       return !!mmap_lock_is_contended(priv->lock_ctx.mm);
> -}
> -
>  static inline void drop_rcu(struct proc_maps_private *priv) {}
>  static inline void reacquire_rcu(struct proc_maps_private *priv) {}
>
> @@ -1375,17 +1360,24 @@ get_smaps_shmem_walk_ops(struct proc_maps_private *priv)
>
>  #endif /* CONFIG_PER_VMA_LOCK */
>
> -/*
> - * Gather mem stats from @vma with the indicated beginning
> - * address @start, and keep them in @mss.
> +/**
> + * smap_gather_stats() - Gather mem stats from @vma.
> + * @priv: proc maps private state.
> + * @vma: The VMA whoms stats we wish to gather.
> + * @mss: The accumulated stats.
> + * @start: The address from which to start.
>   *
> - * Use vm_start of @vma as the beginning address if @start is 0.
> + * This gathers stats for the whole of the VMA unless the mmap lock was dropped
> + * and we raced a VMA merge, in which case we only gather stats for the
> + * remainder of the merged range.
>   */
>  static void smap_gather_stats(struct proc_maps_private *priv,
>                               struct vm_area_struct *vma,
> -                             struct mem_size_stats *mss, unsigned long start)
> +                             struct mem_size_stats *mss,
> +                             unsigned long start)
>  {
>         const struct mm_walk_ops *ops = get_smaps_walk_ops(priv);
> +       const bool is_partial = start > vma->vm_start;
>
>         /* Invalid start */
>         if (start >= vma->vm_end)
> @@ -1408,20 +1400,20 @@ static void smap_gather_stats(struct proc_maps_private *priv,
>                  * Unless we know that the shmem object (or the part mapped by
>                  * our VMA) has no swapped out pages at all.
>                  */
> -               unsigned long shmem_swapped = shmem_swap_usage(vma);
> +               const unsigned long shmem_swapped = shmem_swap_usage(vma);
> +               const bool shared_or_ro = vma_test(vma, VMA_SHARED_BIT) ||
> +                       !vma_test(vma, VMA_WRITE_BIT);
>
> -               if (!start && (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
> -                                       !(vma->vm_flags & VM_WRITE))) {
> +               if (!is_partial && (!shmem_swapped || shared_or_ro))
>                         mss->swap += shmem_swapped;
> -               } else {
> +               else
>                         ops = get_smaps_shmem_walk_ops(priv);
> -               }
>         }
>
> -       if (!start)
> -               walk_page_vma(vma, ops, mss);
> -       else
> +       if (is_partial)
>                 walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss);
> +       else
> +               walk_page_vma(vma, ops, mss);
>
>         reacquire_rcu(priv);
>  }
> @@ -1476,7 +1468,7 @@ static int show_smap(struct seq_file *m, void *v)
>         struct vm_area_struct *vma = v;
>         struct mem_size_stats mss = {};
>
> -       smap_gather_stats(priv, vma, &mss, 0);
> +       smap_gather_stats(priv, vma, &mss, vma->vm_start);
>
>         show_map_vma(m, vma);
>
> @@ -1510,7 +1502,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>         loff_t pos = 0;
>         int ret = 0;
>
> -
>         priv->task = get_proc_task(priv->inode);
>         if (!priv->task)
>                 return -ESRCH;
> @@ -1527,15 +1518,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>
>         vma_iter_init(&priv->iter, mm, 0);
>         vma = proc_get_vma(m, &pos);
> -       if (unlikely(!vma) || vma == get_gate_vma(priv->lock_ctx.mm))
> +       if (unlikely(!vma))
>                 goto empty_set;
>
> -       if (IS_ERR(vma)) {
> -               ret = PTR_ERR(vma);
> -               goto out_unlock;
> -       }
> -
> -       vma_start = vma->vm_start;
> +       vma_start = IS_ERR(vma) ? 0 : vma->vm_start;
>         while (vma) {
>                 if (IS_ERR(vma)) {
>                         ret = PTR_ERR(vma);
> @@ -1545,20 +1531,29 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
>                 if (vma == get_gate_vma(priv->lock_ctx.mm))
>                         break;
>
> -               /*
> -                * If after retaking mmap_lock, already reported VMA grew or
> -                * merged with the next one, then iterate from last_vma_end.
> -                */
> -               smap_gather_stats(priv, vma, &mss,
> -                                 vma->vm_start < last_vma_end ? last_vma_end : 0);
> +               /* Handles the case of VMA merged since mmap locked drop too. */
> +               smap_gather_stats(priv, vma, &mss, last_vma_end);
>                 last_vma_end = vma->vm_end;
>
>                 /*
> -                * Release mmap_lock temporarily if someone wants to
> -                * take it for write request.
> +                * If the VMA lock is not taken, we hold the often contended
> +                 * mmap lock. This can be because the arch doesn't support VMA
> +                 * locks,or we had to fall back to the mmap lock.
> +                *
> +                * To relieve pressure, check if it is indeed contended, then
> +                 * temporarily release it.
>                  */
> -               if (is_mmap_lock_contended(priv)) {
> -                       unlock_vma_range(&priv->lock_ctx);
> +               if (lock_ctx->mmap_locked && mmap_lock_is_contended(mm)) {
> +                       unlock_ctx_mm(lock_ctx);
> +
> +                       /*
> +                        * If we are using VMA locks but fell back to an mmap
> +                        * lock, we may be able to VMA lock the next VMA, so
> +                        * reset the lock and try again.
> +                        *
> +                        * Otherwise, if the arch doesn't support VMA locks,
> +                        * this simply retakes the mmap lock.
> +                        */
>                         ret = lock_vma_range(m, lock_ctx);
>                         if (ret)
>                                 goto out_put_mm;
> --
> 2.54.0


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code
  2026-06-09  8:27 ` Lorenzo Stoakes
@ 2026-06-09 17:13   ` Suren Baghdasaryan
  0 siblings, 0 replies; 17+ messages in thread
From: Suren Baghdasaryan @ 2026-06-09 17:13 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: akpm, liam, vbabka, david, willy, jannh, paulmck, pfalcato,
	linux-mm, linux-kernel, linux-fsdevel

On Tue, Jun 9, 2026 at 1:28 AM Lorenzo Stoakes <ljs@kernel.org> wrote:
>
> On Fri, Jun 05, 2026 at 06:57:28PM -0700, Suren Baghdasaryan wrote:
> > To correctly propagate error code from lock_vma_range(), change it to
> > return the error code instead of the boolean. This simplifies error
> > propagation code.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
>
> Logic LGTM, couple nits below, but:
>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>

Thanks!

>
> > ---
> >  fs/proc/task_mmu.c | 16 +++++++++-------
> >  1 file changed, 9 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index d32408f7cd5e..023422fcee12 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -162,13 +162,13 @@ static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx)
> >       }
> >  }
> >
> > -static inline bool lock_vma_range(struct seq_file *m,
> > -                               struct proc_maps_locking_ctx *lock_ctx)
> > +static inline int lock_vma_range(struct seq_file *m,
>
> Could drop the unnecessary inline here while we're here.

Will do.

>
> > +                              struct proc_maps_locking_ctx *lock_ctx)
> >  {
> >       rcu_read_lock();
> >       reset_lock_ctx(lock_ctx);
> >
> > -     return true;
> > +     return 0;
> >  }
> >
> >  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -245,10 +245,10 @@ static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx)
> >       mmap_read_unlock(lock_ctx->mm);
> >  }
> >
> > -static inline bool lock_vma_range(struct seq_file *m,
> > +static inline int lock_vma_range(struct seq_file *m,
>
> Same comment as above.
>
> >                                 struct proc_maps_locking_ctx *lock_ctx)
> >  {
> > -     return lock_ctx_mm(lock_ctx) == 0;
> > +     return lock_ctx_mm(lock_ctx);
> >  }
> >
> >  static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx)
> > @@ -311,6 +311,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >       struct proc_maps_locking_ctx *lock_ctx;
> >       loff_t last_addr = *ppos;
> >       struct mm_struct *mm;
> > +     int err;
> >
> >       /* See m_next(). Zero at the start or after lseek. */
> >       if (last_addr == SENTINEL_VMA_END)
> > @@ -328,11 +329,12 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >               return NULL;
> >       }
> >
> > -     if (!lock_vma_range(m, lock_ctx)) {
> > +     err = lock_vma_range(m, lock_ctx);
> > +     if (err) {
> >               mmput(mm);
> >               put_task_struct(priv->task);
> >               priv->task = NULL;
> > -             return ERR_PTR(-EINTR);
> > +             return ERR_PTR(err);
> >       }
> >
> >       /*
> >
> > base-commit: e178a530a81621a29efbca49b3b78202a18236e4
> > --
> > 2.54.0.1032.g2f8565e1d1-goog
> >


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2026-06-09 17:13 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-06  1:57 [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code Suren Baghdasaryan
2026-06-06  1:57 ` [PATCH 2/2] fs/proc/task_mmu: read proc/pid/smaps_rollup under per-vma lock Suren Baghdasaryan
2026-06-06  2:00   ` Suren Baghdasaryan
2026-06-06  8:12   ` Lorenzo Stoakes
2026-06-07 19:45     ` Suren Baghdasaryan
2026-06-08  8:14       ` Lorenzo Stoakes
2026-06-08 15:32         ` David Hildenbrand (Arm)
2026-06-08 15:44           ` Suren Baghdasaryan
2026-06-08 15:43         ` Suren Baghdasaryan
2026-06-08 15:52   ` David Hildenbrand (Arm)
2026-06-08 16:20     ` Suren Baghdasaryan
2026-06-09 10:00   ` Lorenzo Stoakes
2026-06-09 17:11     ` Suren Baghdasaryan
2026-06-08 15:38 ` [PATCH 1/2] fs/proc/task_mmu: change lock_vma_range() to return error code David Hildenbrand (Arm)
2026-06-08 15:43   ` Suren Baghdasaryan
2026-06-09  8:27 ` Lorenzo Stoakes
2026-06-09 17:13   ` Suren Baghdasaryan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.