linux-mm.kvack.org archive mirror
* [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
@ 2025-05-30 10:44 Barry Song
  2025-05-30 14:06 ` Jann Horn
  2025-06-03 18:43 ` Lorenzo Stoakes
  0 siblings, 2 replies; 35+ messages in thread
From: Barry Song @ 2025-05-30 10:44 UTC (permalink / raw)
  To: akpm, linux-mm
  Cc: linux-kernel, Barry Song, Liam R. Howlett, Lorenzo Stoakes,
	David Hildenbrand, Vlastimil Babka, Jann Horn, Suren Baghdasaryan,
	Lokesh Gidra, Tangquan Zheng

From: Barry Song <v-songbaohua@oppo.com>

Certain madvise operations, especially MADV_DONTNEED, occur far more
frequently than other madvise options, particularly in native and Java
heaps for dynamic memory management.

Currently, the mmap_lock is always held during these operations, even when
unnecessary. This causes lock contention and can lead to severe priority
inversion, where low-priority threads—such as Android's HeapTaskDaemon—
hold the lock and block higher-priority threads.

This patch enables the use of per-VMA locks when the advised range lies
entirely within a single VMA, avoiding the need for full VMA traversal. In
practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
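
For context, the pattern this targets is a heap returning part of a single
anonymous mapping to the kernel, roughly like the following userspace sketch
(illustrative only, needs <sys/mman.h>; sizes and offsets are arbitrary):

	void *heap = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* ... the allocator carves objects out of 'heap', later frees some ... */
	/* the advised range lies entirely within the single VMA backing 'heap' */
	madvise((char *)heap + (64UL << 10), 256UL << 10, MADV_DONTNEED);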

Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
benefits from this per-VMA lock optimization. After extended runtime,
217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
only 1,231 fell back to mmap_lock.

To simplify handling, the implementation falls back to the standard
mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
userfaultfd_remove().

Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 -v2:
 * try to hide the per-vma lock in madvise_lock, per Lorenzo;
 * ideally, for vector_madvise(), we would decide the lock type
   for each iteration; for the moment, we still use the global lock.
 -v1:
 https://lore.kernel.org/linux-mm/20250527044145.13153-1-21cnbao@gmail.com/

 mm/madvise.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 9 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 8433ac9b27e0..d408ffa404b3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -51,6 +51,7 @@ struct madvise_walk_private {
 struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
+	struct vm_area_struct *vma;
 };
 
 /*
@@ -1553,6 +1554,21 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	return unmapped_error;
 }
 
+/*
+ * Call the visit function on the single vma with the per_vma lock
+ */
+static inline
+int madvise_single_locked_vma(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end, void *arg,
+		int (*visit)(struct vm_area_struct *vma,
+				   struct vm_area_struct **prev, unsigned long start,
+				   unsigned long end, void *arg))
+{
+	struct vm_area_struct *prev;
+
+	return visit(vma, &prev, start, end, arg);
+}
+
 #ifdef CONFIG_ANON_VMA_NAME
 static int madvise_vma_anon_name(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
@@ -1603,7 +1619,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
-static int madvise_lock(struct mm_struct *mm, int behavior)
+static int __madvise_lock(struct mm_struct *mm, int behavior)
 {
 	if (is_memory_failure(behavior))
 		return 0;
@@ -1617,7 +1633,7 @@ static int madvise_lock(struct mm_struct *mm, int behavior)
 	return 0;
 }
 
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void __madvise_unlock(struct mm_struct *mm, int behavior)
 {
 	if (is_memory_failure(behavior))
 		return;
@@ -1628,6 +1644,46 @@ static void madvise_unlock(struct mm_struct *mm, int behavior)
 		mmap_read_unlock(mm);
 }
 
+static int madvise_lock(struct mm_struct *mm, unsigned long start,
+		unsigned long len, struct madvise_behavior *madv_behavior)
+{
+	int behavior = madv_behavior->behavior;
+
+	/*
+	 * MADV_DONTNEED is commonly used with userspace heaps and most often
+	 * affects a single VMA. In these cases, we can use per-VMA locks to
+	 * reduce contention on the mmap_lock.
+	 */
+	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) {
+		struct vm_area_struct *vma;
+		unsigned long end;
+
+		start = untagged_addr(start);
+		end = start + len;
+		vma = lock_vma_under_rcu(mm, start);
+		if (!vma)
+			goto out;
+		if (end > vma->vm_end || userfaultfd_armed(vma)) {
+			vma_end_read(vma);
+			goto out;
+		}
+		madv_behavior->vma = vma;
+		return 0;
+	}
+
+out:
+	return __madvise_lock(mm, behavior);
+}
+
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	if (madv_behavior->vma)
+		vma_end_read(madv_behavior->vma);
+	else
+		__madvise_unlock(mm, madv_behavior->behavior);
+}
+
 static bool madvise_batch_tlb_flush(int behavior)
 {
 	switch (behavior) {
@@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
 		unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
 {
+	struct vm_area_struct *vma = madv_behavior->vma;
 	int behavior = madv_behavior->behavior;
+
 	struct blk_plug plug;
 	unsigned long end;
 	int error;
 
 	if (is_memory_failure(behavior))
 		return madvise_inject_error(behavior, start, start + len_in);
-	start = untagged_addr_remote(mm, start);
+	start = untagged_addr(start);
 	end = start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
 	if (is_madvise_populate(behavior))
 		error = madvise_populate(mm, start, end, behavior);
+	else if (vma)
+		error = madvise_single_locked_vma(vma, start, end,
+				madv_behavior, madvise_vma_behavior);
 	else
 		error = madvise_walk_vmas(mm, start, end, madv_behavior,
 					  madvise_vma_behavior);
@@ -1817,13 +1878,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 
 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, behavior);
+	error = madvise_lock(mm, start, len_in, &madv_behavior);
 	if (error)
 		return error;
 	madvise_init_tlb(&madv_behavior, mm);
 	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 	return error;
 }
@@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 	total_len = iov_iter_count(iter);
 
-	ret = madvise_lock(mm, behavior);
+	ret = __madvise_lock(mm, behavior);
 	if (ret)
 		return ret;
 	madvise_init_tlb(&madv_behavior, mm);
@@ -1880,8 +1941,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, behavior);
-			madvise_lock(mm, behavior);
+			__madvise_unlock(mm, behavior);
+			__madvise_lock(mm, behavior);
 			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
@@ -1890,7 +1951,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	__madvise_unlock(mm, behavior);
 
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
 
-- 
2.39.3 (Apple Git-146)




* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 10:44 [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED Barry Song
@ 2025-05-30 14:06 ` Jann Horn
  2025-05-30 14:34   ` Lorenzo Stoakes
                     ` (2 more replies)
  2025-06-03 18:43 ` Lorenzo Stoakes
  1 sibling, 3 replies; 35+ messages in thread
From: Jann Horn @ 2025-05-30 14:06 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	Lorenzo Stoakes, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> Certain madvise operations, especially MADV_DONTNEED, occur far more
> frequently than other madvise options, particularly in native and Java
> heaps for dynamic memory management.
>
> Currently, the mmap_lock is always held during these operations, even when
> unnecessary. This causes lock contention and can lead to severe priority
> inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> hold the lock and block higher-priority threads.
>
> This patch enables the use of per-VMA locks when the advised range lies
> entirely within a single VMA, avoiding the need for full VMA traversal. In
> practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
>
> Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> benefits from this per-VMA lock optimization. After extended runtime,
> 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> only 1,231 fell back to mmap_lock.
>
> To simplify handling, the implementation falls back to the standard
> mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> userfaultfd_remove().

One important quirk of this is that it can, from what I can see, cause
freeing of page tables (through pt_reclaim) without holding the mmap
lock at all:

do_madvise [behavior=MADV_DONTNEED]
  madvise_lock
    lock_vma_under_rcu
  madvise_do_behavior
    madvise_single_locked_vma
      madvise_vma_behavior
        madvise_dontneed_free
          madvise_dontneed_single_vma
            zap_page_range_single_batched [.reclaim_pt = true]
              unmap_single_vma
                unmap_page_range
                  zap_p4d_range
                    zap_pud_range
                      zap_pmd_range
                        zap_pte_range
                          try_get_and_clear_pmd
                          free_pte

This clashes with the assumption in walk_page_range_novma() that
holding the mmap lock in write mode is sufficient to prevent
concurrent page table freeing, so it can probably lead to page table
UAF through the ptdump interface (see ptdump_walk_pgd()).

I think before this patch can land, you'll have to introduce some new
helper like:

void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
{
  mmap_write_lock(mm);
  for_each_vma(vmi, vma)
    vma_start_write(vma);
}

and use that in walk_page_range_novma() for user virtual address space
walks, and update the comment in there.
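
For clarity, a slightly more complete version of that sketch (only the
iterator and local declarations are added; still purely illustrative):

void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	/* take the mmap lock in write mode, then mark every VMA write-locked */
	mmap_write_lock(mm);
	for_each_vma(vmi, vma)
		vma_start_write(vma);
}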

> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Jann Horn <jannh@google.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Lokesh Gidra <lokeshgidra@google.com>
> Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> Signed-off-by: Barry Song <v-songbaohua@oppo.com>
[...]
> +static void madvise_unlock(struct mm_struct *mm,
> +               struct madvise_behavior *madv_behavior)
> +{
> +       if (madv_behavior->vma)
> +               vma_end_read(madv_behavior->vma);

Please set madv_behavior->vma to NULL here, so that if madvise_lock()
was called on madv_behavior again and decided to take the mmap lock
that time, the next madvise_unlock() wouldn't take the wrong branch
here.
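
Roughly, applied to the madvise_unlock() added above (sketch only):

static void madvise_unlock(struct mm_struct *mm,
		struct madvise_behavior *madv_behavior)
{
	if (madv_behavior->vma) {
		vma_end_read(madv_behavior->vma);
		/* don't leave a stale pointer behind for the next lock cycle */
		madv_behavior->vma = NULL;
	} else {
		__madvise_unlock(mm, madv_behavior->behavior);
	}
}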

> +       else
> +               __madvise_unlock(mm, madv_behavior->behavior);
> +}
> +
>  static bool madvise_batch_tlb_flush(int behavior)
>  {
>         switch (behavior) {
> @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
>                 unsigned long start, size_t len_in,
>                 struct madvise_behavior *madv_behavior)
>  {
> +       struct vm_area_struct *vma = madv_behavior->vma;
>         int behavior = madv_behavior->behavior;
> +
>         struct blk_plug plug;
>         unsigned long end;
>         int error;
>
>         if (is_memory_failure(behavior))
>                 return madvise_inject_error(behavior, start, start + len_in);
> -       start = untagged_addr_remote(mm, start);
> +       start = untagged_addr(start);

Why is this okay? I see that X86's untagged_addr_remote() asserts that
the mmap lock is held, which is no longer the case here with your
patch, but untagged_addr() seems wrong here, since we can be operating
on another process. I think especially on X86 with 5-level paging and
LAM, there can probably be cases where address bits are used for part
of the virtual address in one task while they need to be masked off in
another task?

I wonder if you'll have to refactor X86 and Risc-V first to make this
work... ideally by making sure that their address tagging state
updates are atomic and untagged_addr_remote() works locklessly.

(Or you could try to use something like the
mmap_write_lock_with_all_vmas() I proposed above for synchronizing
against untagged_addr(), first write-lock the MM and then write-lock
all VMAs in it...)

>         end = start + PAGE_ALIGN(len_in);
>
>         blk_start_plug(&plug);
>         if (is_madvise_populate(behavior))
>                 error = madvise_populate(mm, start, end, behavior);
> +       else if (vma)
> +               error = madvise_single_locked_vma(vma, start, end,
> +                               madv_behavior, madvise_vma_behavior);
>         else
>                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
>                                           madvise_vma_behavior);
> @@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
>
>         total_len = iov_iter_count(iter);
>
> -       ret = madvise_lock(mm, behavior);
> +       ret = __madvise_lock(mm, behavior);
>         if (ret)
>                 return ret;
>         madvise_init_tlb(&madv_behavior, mm);

(I think Lorenzo was saying that he would like madvise_lock() to also
be used in vector_madvise()? But I'll let him comment on that.)



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 14:06 ` Jann Horn
@ 2025-05-30 14:34   ` Lorenzo Stoakes
  2025-05-30 20:17     ` Barry Song
  2025-05-30 20:40     ` Jann Horn
  2025-05-30 22:00   ` Barry Song
  2025-06-03  7:24   ` Qi Zheng
  2 siblings, 2 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-05-30 14:34 UTC (permalink / raw)
  To: Jann Horn
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Barry - I was going to come back to this later, but Jann's sort of bumped
this in my inbox.

This implementation isn't quite what I was after, would you give me a
little bit before a respin so I can have a think about this and make
sensible suggestions?

Thanks!

On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > frequently than other madvise options, particularly in native and Java
> > heaps for dynamic memory management.
> >
> > Currently, the mmap_lock is always held during these operations, even when
> > unnecessary. This causes lock contention and can lead to severe priority
> > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > hold the lock and block higher-priority threads.
> >
> > This patch enables the use of per-VMA locks when the advised range lies
> > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> >
> > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > benefits from this per-VMA lock optimization. After extended runtime,
> > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > only 1,231 fell back to mmap_lock.
> >
> > To simplify handling, the implementation falls back to the standard
> > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > userfaultfd_remove().
>
> One important quirk of this is that it can, from what I can see, cause
> freeing of page tables (through pt_reclaim) without holding the mmap
> lock at all:
>
> do_madvise [behavior=MADV_DONTNEED]
>   madvise_lock
>     lock_vma_under_rcu
>   madvise_do_behavior
>     madvise_single_locked_vma
>       madvise_vma_behavior
>         madvise_dontneed_free
>           madvise_dontneed_single_vma
>             zap_page_range_single_batched [.reclaim_pt = true]
>               unmap_single_vma
>                 unmap_page_range
>                   zap_p4d_range
>                     zap_pud_range
>                       zap_pmd_range
>                         zap_pte_range
>                           try_get_and_clear_pmd
>                           free_pte
>
> This clashes with the assumption in walk_page_range_novma() that
> holding the mmap lock in write mode is sufficient to prevent
> concurrent page table freeing, so it can probably lead to page table
> UAF through the ptdump interface (see ptdump_walk_pgd()).

Hmmmmmm is this because of the series that allows page table freeing on
zap... I think Zi's?

We need to update the documentation on this then... which currently states
the VMA need only be stable.

I guess this is still the case except for the novma walker you mention.

Relatedly, It's worth looking at Dev's series which introduces a concerning
new 'no lock at all' mode to the page table walker explicitly for novma. I
cc'd you :) See [0].

[0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/

>
> I think before this patch can land, you'll have to introduce some new
> helper like:
>
> void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> {
>   mmap_write_lock(mm);
>   for_each_vma(vmi, vma)
>     vma_start_write(vma);
> }
>
> and use that in walk_page_range_novma() for user virtual address space
> walks, and update the comment in there.

What dude? No, what? Marking literally all VMAs write locked? :/

I think this could have unexpected impact no? We're basically disabling VMA
locking when we're in novma, that seems... really silly?


>
> > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Vlastimil Babka <vbabka@suse.cz>
> > Cc: Jann Horn <jannh@google.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Lokesh Gidra <lokeshgidra@google.com>
> > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> [...]
> > +static void madvise_unlock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> > +{
> > +       if (madv_behavior->vma)
> > +               vma_end_read(madv_behavior->vma);
>
> Please set madv_behavior->vma to NULL here, so that if madvise_lock()
> was called on madv_behavior again and decided to take the mmap lock
> that time, the next madvise_unlock() wouldn't take the wrong branch
> here.

Yeah, I'm not a fan of having the vma referenced here; this isn't quite what
I meant.

>
> > +       else
> > +               __madvise_unlock(mm, madv_behavior->behavior);
> > +}
> > +
> >  static bool madvise_batch_tlb_flush(int behavior)
> >  {
> >         switch (behavior) {
> > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >                 unsigned long start, size_t len_in,
> >                 struct madvise_behavior *madv_behavior)
> >  {
> > +       struct vm_area_struct *vma = madv_behavior->vma;
> >         int behavior = madv_behavior->behavior;
> > +
> >         struct blk_plug plug;
> >         unsigned long end;
> >         int error;
> >
> >         if (is_memory_failure(behavior))
> >                 return madvise_inject_error(behavior, start, start + len_in);
> > -       start = untagged_addr_remote(mm, start);
> > +       start = untagged_addr(start);
>
> Why is this okay? I see that X86's untagged_addr_remote() asserts that
> the mmap lock is held, which is no longer the case here with your
> patch, but untagged_addr() seems wrong here, since we can be operating
> on another process. I think especially on X86 with 5-level paging and
> LAM, there can probably be cases where address bits are used for part
> of the virtual address in one task while they need to be masked off in
> another task?
>
> I wonder if you'll have to refactor X86 and Risc-V first to make this
> work... ideally by making sure that their address tagging state
> updates are atomic and untagged_area_remote() works locklessly.

Yeah I don't know why we're doing this at all? This seems new unless I
missed it?

>
> (Or you could try to use something like the
> mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> against untagged_addr(), first write-lock the MM and then write-lock
> all VMAs in it...)

This would completely eliminate the point of this patch no? The whole point
is not taking these locks... And I'm very much not in favour of
write-locking literally every single VMA. under any circumstances.

>
> >         end = start + PAGE_ALIGN(len_in);
> >
> >         blk_start_plug(&plug);
> >         if (is_madvise_populate(behavior))
> >                 error = madvise_populate(mm, start, end, behavior);
> > +       else if (vma)
> > +               error = madvise_single_locked_vma(vma, start, end,
> > +                               madv_behavior, madvise_vma_behavior);
> >         else
> >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> >                                           madvise_vma_behavior);
> > @@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >         total_len = iov_iter_count(iter);
> >
> > -       ret = madvise_lock(mm, behavior);
> > +       ret = __madvise_lock(mm, behavior);
> >         if (ret)
> >                 return ret;
> >         madvise_init_tlb(&madv_behavior, mm);
>
> (I think Lorenzo was saying that he would like madvise_lock() to also
> be used in vector_madvise()? But I'll let him comment on that.)

Yeah I"m not a fan of this implementation it's not really what I had in
mind, as per top of mail.

Will come back with suggestions later.

Thanks!



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 14:34   ` Lorenzo Stoakes
@ 2025-05-30 20:17     ` Barry Song
  2025-06-02 17:35       ` SeongJae Park
  2025-05-30 20:40     ` Jann Horn
  1 sibling, 1 reply; 35+ messages in thread
From: Barry Song @ 2025-05-30 20:17 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Fri, May 30, 2025 at 10:34 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> Barry - I was going to come back to this later, but Jann's sort of bumped
> this in my inbox.
>
> This implementation isn't quite what I was after, would you give me a
> little bit before a respin so I can have a think about this and make
> sensible suggestions?

Sure.

>
> Thanks!
>
> On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > > frequently than other madvise options, particularly in native and Java
> > > heaps for dynamic memory management.
> > >
> > > Currently, the mmap_lock is always held during these operations, even when
> > > unnecessary. This causes lock contention and can lead to severe priority
> > > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > > hold the lock and block higher-priority threads.
> > >
> > > This patch enables the use of per-VMA locks when the advised range lies
> > > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> > >
> > > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > > benefits from this per-VMA lock optimization. After extended runtime,
> > > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > > only 1,231 fell back to mmap_lock.
> > >
> > > To simplify handling, the implementation falls back to the standard
> > > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > > userfaultfd_remove().
> >
> > One important quirk of this is that it can, from what I can see, cause
> > freeing of page tables (through pt_reclaim) without holding the mmap
> > lock at all:
> >
> > do_madvise [behavior=MADV_DONTNEED]
> >   madvise_lock
> >     lock_vma_under_rcu
> >   madvise_do_behavior
> >     madvise_single_locked_vma
> >       madvise_vma_behavior
> >         madvise_dontneed_free
> >           madvise_dontneed_single_vma
> >             zap_page_range_single_batched [.reclaim_pt = true]
> >               unmap_single_vma
> >                 unmap_page_range
> >                   zap_p4d_range
> >                     zap_pud_range
> >                       zap_pmd_range
> >                         zap_pte_range
> >                           try_get_and_clear_pmd
> >                           free_pte
> >
> > This clashes with the assumption in walk_page_range_novma() that
> > holding the mmap lock in write mode is sufficient to prevent
> > concurrent page table freeing, so it can probably lead to page table
> > UAF through the ptdump interface (see ptdump_walk_pgd()).
>
> Hmmmmmm is this because of the series that allows page table freeing on
> zap... I think Zi's?
>
> We need to update the documentation on this then... which currently states
> the VMA need only be stable.
>
> I guess this is still the case except for the novma walker you mention.
>
> Relatedly, It's worth looking at Dev's series which introduces a concerning
> new 'no lock at all' mode to the page table walker explicitly for novma. I
> cc'd you :) See [0].
>
> [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/
>
> >
> > I think before this patch can land, you'll have to introduce some new
> > helper like:
> >
> > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> > {
> >   mmap_write_lock(mm);
> >   for_each_vma(vmi, vma)
> >     vma_start_write(vma);
> > }
> >
> > and use that in walk_page_range_novma() for user virtual address space
> > walks, and update the comment in there.
>
> What dude? No, what? Marking literally all VMAs write locked? :/
>
> I think this could have unexpected impact no? We're basically disabling VMA
> locking when we're in novma, that seems... really silly?
>
>
> >
> > > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> > > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > Cc: David Hildenbrand <david@redhat.com>
> > > Cc: Vlastimil Babka <vbabka@suse.cz>
> > > Cc: Jann Horn <jannh@google.com>
> > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > Cc: Lokesh Gidra <lokeshgidra@google.com>
> > > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> > [...]
> > > +static void madvise_unlock(struct mm_struct *mm,
> > > +               struct madvise_behavior *madv_behavior)
> > > +{
> > > +       if (madv_behavior->vma)
> > > +               vma_end_read(madv_behavior->vma);
> >
> > Please set madv_behavior->vma to NULL here, so that if madvise_lock()
> > was called on madv_behavior again and decided to take the mmap lock
> > that time, the next madvise_unlock() wouldn't take the wrong branch
> > here.

I actually put a note about vector_madvise() in the changelog:
"
 * ideally, for vector_madvise(), we would decide the lock type
   for each iteration; for the moment, we still use the global lock."

I held off on that one because I'd rather get feedback before going too
far, so vector_madvise() wasn't converted; it only switched to
__madvise_lock() and __madvise_unlock().

For that case, we would need to take madvise_lock() again after releasing
it; otherwise, this situation doesn't arise.

BTW, I found that vector_madvise() doesn't check the return value of
madvise_lock(); that also seems like a bug?

static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
{
                        /* Drop and reacquire lock to unwind race. */
                        madvise_finish_tlb(&madv_behavior);
                       madvise_unlock(mm, behavior);
                       madvise_lock(mm, behavior);  /* missing the ret check */
                        madvise_init_tlb(&madv_behavior, mm)
}
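
A sketch of the missing check (the error path is hypothetical and only
illustrates the point; a failed relock would have to skip the final
madvise_finish_tlb()/madvise_unlock()):

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(mm, behavior);
			ret = madvise_lock(mm, behavior);
			if (ret)
				goto out;	/* hypothetical label after the final unlock */
			madvise_init_tlb(&madv_behavior, mm);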

>
> Yeah, I'm not a fan of having the vma referenced here; this isn't quite what
> I meant.
>
> >
> > > +       else
> > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > +}
> > > +
> > >  static bool madvise_batch_tlb_flush(int behavior)
> > >  {
> > >         switch (behavior) {
> > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > >                 unsigned long start, size_t len_in,
> > >                 struct madvise_behavior *madv_behavior)
> > >  {
> > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > >         int behavior = madv_behavior->behavior;
> > > +
> > >         struct blk_plug plug;
> > >         unsigned long end;
> > >         int error;
> > >
> > >         if (is_memory_failure(behavior))
> > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > -       start = untagged_addr_remote(mm, start);
> > > +       start = untagged_addr(start);
> >
> > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > the mmap lock is held, which is no longer the case here with your
> > patch, but untagged_addr() seems wrong here, since we can be operating
> > on another process. I think especially on X86 with 5-level paging and
> > LAM, there can probably be cases where address bits are used for part
> > of the virtual address in one task while they need to be masked off in
> > another task?
> >
> > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > work... ideally by making sure that their address tagging state
> > updates are atomic and untagged_addr_remote() works locklessly.
>
> Yeah I don't know why we're doing this at all? This seems new unless I
> missed it?

We might call madvise_do_behavior() under the per-VMA lock, but
untagged_addr_remote() always asserts that the mmap_lock is held, which
is also asserted by find_vma() in madvise_walk_vmas(). So at least for
architectures other than RISC-V and x86, there is no difference.

include/linux/uaccess.h

#ifndef untagged_addr_remote
#define untagged_addr_remote(mm, addr) ({ \
              mmap_assert_locked(mm); \
              untagged_addr(addr); \
})
#endif

I didn't realize MADV_DONTNEED could be done on a remote process;
can it?

>
> >
> > (Or you could try to use something like the
> > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > against untagged_addr(), first write-lock the MM and then write-lock
> > all VMAs in it...)
>
> This would completely eliminate the point of this patch no? The whole point
> is not taking these locks... And I'm very much not in favour of
> write-locking literally every single VMA. under any circumstances.
>
> >
> > >         end = start + PAGE_ALIGN(len_in);
> > >
> > >         blk_start_plug(&plug);
> > >         if (is_madvise_populate(behavior))
> > >                 error = madvise_populate(mm, start, end, behavior);
> > > +       else if (vma)
> > > +               error = madvise_single_locked_vma(vma, start, end,
> > > +                               madv_behavior, madvise_vma_behavior);
> > >         else
> > >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> > >                                           madvise_vma_behavior);
> > > @@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> > >
> > >         total_len = iov_iter_count(iter);
> > >
> > > -       ret = madvise_lock(mm, behavior);
> > > +       ret = __madvise_lock(mm, behavior);
> > >         if (ret)
> > >                 return ret;
> > >         madvise_init_tlb(&madv_behavior, mm);
> >
> > (I think Lorenzo was saying that he would like madvise_lock() to also
> > be used in vector_madvise()? But I'll let him comment on that.)
>
> Yeah I"m not a fan of this implementation it's not really what I had in
> mind, as per top of mail.
>
> Will come back with suggestions later.
>

thanks!


> Thanks!

Barry



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 14:34   ` Lorenzo Stoakes
  2025-05-30 20:17     ` Barry Song
@ 2025-05-30 20:40     ` Jann Horn
  2025-06-02 11:50       ` Lorenzo Stoakes
  2025-06-03  7:06       ` Barry Song
  1 sibling, 2 replies; 35+ messages in thread
From: Jann Horn @ 2025-05-30 20:40 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Fri, May 30, 2025 at 4:34 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> Barry - I was going to come back to this later, but Jann's sort of bumped
> this in my inbox.
>
> This implementation isn't quite what I was after, would you give me a
> little bit before a respin so I can have a think about this and make
> sensible suggestions?
>
> Thanks!
>
> On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > One important quirk of this is that it can, from what I can see, cause
> > freeing of page tables (through pt_reclaim) without holding the mmap
> > lock at all:
> >
> > do_madvise [behavior=MADV_DONTNEED]
> >   madvise_lock
> >     lock_vma_under_rcu
> >   madvise_do_behavior
> >     madvise_single_locked_vma
> >       madvise_vma_behavior
> >         madvise_dontneed_free
> >           madvise_dontneed_single_vma
> >             zap_page_range_single_batched [.reclaim_pt = true]
> >               unmap_single_vma
> >                 unmap_page_range
> >                   zap_p4d_range
> >                     zap_pud_range
> >                       zap_pmd_range
> >                         zap_pte_range
> >                           try_get_and_clear_pmd
> >                           free_pte
> >
> > This clashes with the assumption in walk_page_range_novma() that
> > holding the mmap lock in write mode is sufficient to prevent
> > concurrent page table freeing, so it can probably lead to page table
> > UAF through the ptdump interface (see ptdump_walk_pgd()).
>
> Hmmmmmm is this because of the series that allows page table freeing on
> zap... I think Zi's?

Yeah, that was Qi Zheng's
https://lore.kernel.org/all/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com/
.

> We need to update the documentation on this then... which currently states
> the VMA need only be stable.
>
> I guess this is still the case except for the novma walker you mention.
>
> Relatedly, It's worth looking at Dev's series which introduces a concerning
> new 'no lock at all' mode to the page table walker explicitly for novma. I
> cc'd you :) See [0].
>
> [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/

Yeah, I saw that you CC'ed me; at a first glance that seems relatively
innocuous to me as long as it's only done for kernel mappings where
all the rules are different.

>
> >
> > I think before this patch can land, you'll have to introduce some new
> > helper like:
> >
> > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> > {
> >   mmap_write_lock(mm);
> >   for_each_vma(vmi, vma)
> >     vma_start_write(vma);
> > }
> >
> > and use that in walk_page_range_novma() for user virtual address space
> > walks, and update the comment in there.
>
> What dude? No, what? Marking literally all VMAs write locked? :/
>
> I think this could have unexpected impact no? We're basically disabling VMA
> locking when we're in novma, that seems... really silly?

I mean, walk_page_range_novma() being used on user virtual address
space is pretty much a debug-only thing, I don't think it matters if
it has to spend time poking flags in a few thousand VMAs. I guess the
alternative would be to say "ptdump just doesn't show entries between
VMAs, which shouldn't exist in the first place", and change ptdump to
do a normal walk that skips over userspace areas not covered by a VMA.
Maybe that's cleaner.

But FWIW, we already do worse than what I proposed here when
installing MMU notifiers, with mm_take_all_locks().

> > > +       else
> > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > +}
> > > +
> > >  static bool madvise_batch_tlb_flush(int behavior)
> > >  {
> > >         switch (behavior) {
> > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > >                 unsigned long start, size_t len_in,
> > >                 struct madvise_behavior *madv_behavior)
> > >  {
> > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > >         int behavior = madv_behavior->behavior;
> > > +
> > >         struct blk_plug plug;
> > >         unsigned long end;
> > >         int error;
> > >
> > >         if (is_memory_failure(behavior))
> > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > -       start = untagged_addr_remote(mm, start);
> > > +       start = untagged_addr(start);
> >
> > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > the mmap lock is held, which is no longer the case here with your
> > patch, but untagged_addr() seems wrong here, since we can be operating
> > on another process. I think especially on X86 with 5-level paging and
> > LAM, there can probably be cases where address bits are used for part
> > of the virtual address in one task while they need to be masked off in
> > another task?
> >
> > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > work... ideally by making sure that their address tagging state
> > > updates are atomic and untagged_addr_remote() works locklessly.
>
> Yeah I don't know why we're doing this at all? This seems new unless I
> missed it?

Because untagged_addr_remote() has a mmap_assert_locked(mm) on x86 and
reads data that is updated under the mmap lock, I think? So without
this change you should get a lockdep splat on x86.

> > (Or you could try to use something like the
> > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > against untagged_addr(), first write-lock the MM and then write-lock
> > all VMAs in it...)
>
> This would completely eliminate the point of this patch no? The whole point
> is not taking these locks... And I'm very much not in favour of
> write-locking literally every single VMA. under any circumstances.

I'm talking about doing this heavyweight locking in places like
arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) that can, if I understand
correctly, essentially reconfigure the size of the virtual address
space of a running process from 56-bit to 47-bit at the hardware level
and cause address bits that were previously part of the virtual
address to be ignored. READ_ONCE()/WRITE_ONCE() might do the job too,
but then we'll have to keep in mind that two subsequent invocations of
untagged_addr() can translate a userspace-specified virtual address
into two different virtual addresses at the page table level.
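
(For reference, the userspace knob in question: as far as I can tell it is
used roughly like this on x86, with the argument being the number of tag
bits; illustrative only, constants come from <asm/prctl.h>:)

	/* opt the current process into LAM_U57, i.e. 6 ignored tag bits */
	syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6UL);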



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 14:06 ` Jann Horn
  2025-05-30 14:34   ` Lorenzo Stoakes
@ 2025-05-30 22:00   ` Barry Song
  2025-06-02 14:55     ` Jann Horn
  2025-06-03  7:24   ` Qi Zheng
  2 siblings, 1 reply; 35+ messages in thread
From: Barry Song @ 2025-05-30 22:00 UTC (permalink / raw)
  To: Jann Horn
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	Lorenzo Stoakes, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Fri, May 30, 2025 at 10:07 PM Jann Horn <jannh@google.com> wrote:
>
> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > frequently than other madvise options, particularly in native and Java
> > heaps for dynamic memory management.
> >
> > Currently, the mmap_lock is always held during these operations, even when
> > unnecessary. This causes lock contention and can lead to severe priority
> > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > hold the lock and block higher-priority threads.
> >
> > This patch enables the use of per-VMA locks when the advised range lies
> > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> >
> > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > benefits from this per-VMA lock optimization. After extended runtime,
> > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > only 1,231 fell back to mmap_lock.
> >
> > To simplify handling, the implementation falls back to the standard
> > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > userfaultfd_remove().
>
> One important quirk of this is that it can, from what I can see, cause
> freeing of page tables (through pt_reclaim) without holding the mmap
> lock at all:
>
> do_madvise [behavior=MADV_DONTNEED]
>   madvise_lock
>     lock_vma_under_rcu
>   madvise_do_behavior
>     madvise_single_locked_vma
>       madvise_vma_behavior
>         madvise_dontneed_free
>           madvise_dontneed_single_vma
>             zap_page_range_single_batched [.reclaim_pt = true]
>               unmap_single_vma
>                 unmap_page_range
>                   zap_p4d_range
>                     zap_pud_range
>                       zap_pmd_range
>                         zap_pte_range
>                           try_get_and_clear_pmd
>                           free_pte
>
> This clashes with the assumption in walk_page_range_novma() that
> holding the mmap lock in write mode is sufficient to prevent
> concurrent page table freeing, so it can probably lead to page table
> UAF through the ptdump interface (see ptdump_walk_pgd()).
>
> I think before this patch can land, you'll have to introduce some new
> helper like:
>
> void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> {
>   mmap_write_lock(mm);
>   for_each_vma(vmi, vma)
>     vma_start_write(vma);
> }
>
> and use that in walk_page_range_novma() for user virtual address space
> walks, and update the comment in there.
>
> > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Vlastimil Babka <vbabka@suse.cz>
> > Cc: Jann Horn <jannh@google.com>
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Lokesh Gidra <lokeshgidra@google.com>
> > Cc: Tangquan Zheng <zhengtangquan@oppo.com>
> > Signed-off-by: Barry Song <v-songbaohua@oppo.com>
> [...]
> > +static void madvise_unlock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> > +{
> > +       if (madv_behavior->vma)
> > +               vma_end_read(madv_behavior->vma);
>
> Please set madv_behavior->vma to NULL here, so that if madvise_lock()
> was called on madv_behavior again and decided to take the mmap lock
> that time, the next madvise_unlock() wouldn't take the wrong branch
> here.
>
> > +       else
> > +               __madvise_unlock(mm, madv_behavior->behavior);
> > +}
> > +
> >  static bool madvise_batch_tlb_flush(int behavior)
> >  {
> >         switch (behavior) {
> > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >                 unsigned long start, size_t len_in,
> >                 struct madvise_behavior *madv_behavior)
> >  {
> > +       struct vm_area_struct *vma = madv_behavior->vma;
> >         int behavior = madv_behavior->behavior;
> > +
> >         struct blk_plug plug;
> >         unsigned long end;
> >         int error;
> >
> >         if (is_memory_failure(behavior))
> >                 return madvise_inject_error(behavior, start, start + len_in);
> > -       start = untagged_addr_remote(mm, start);
> > +       start = untagged_addr(start);
>
> Why is this okay? I see that X86's untagged_addr_remote() asserts that
> the mmap lock is held, which is no longer the case here with your
> patch, but untagged_addr() seems wrong here, since we can be operating
> on another process. I think especially on X86 with 5-level paging and
> LAM, there can probably be cases where address bits are used for part
> of the virtual address in one task while they need to be masked off in
> another task?
>
> I wonder if you'll have to refactor X86 and Risc-V first to make this
> work... ideally by making sure that their address tagging state
> updates are atomic and untagged_addr_remote() works locklessly.

If possible, can we try to avoid this, at least at this stage? We all
agree that a per-VMA lock for DONTNEED is long overdue. The main goal of
the patch is to drop the mmap_lock for high-frequency madvise operations
like MADV_DONTNEED and potentially MADV_FREE. For these two cases, it's
highly unlikely that one process would be managing the memory of another.
In v2, we're modifying common code, which is why we ended up here.

We could consider doing:

if (current->mm == mm)
       start = untagged_addr(start);
else
       start = untagged_addr_remote(mm, start);

As for remote madvise operations like MADV_COLD, until we resolve the
issue with untagged_addr_remote—which still requires mmap_lock—we can
defer consideration of remote madvise cases.


>
> (Or you could try to use something like the
> mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> against untagged_addr(), first write-lock the MM and then write-lock
> all VMAs in it...)
>
> >         end = start + PAGE_ALIGN(len_in);
> >
> >         blk_start_plug(&plug);
> >         if (is_madvise_populate(behavior))
> >                 error = madvise_populate(mm, start, end, behavior);
> > +       else if (vma)
> > +               error = madvise_single_locked_vma(vma, start, end,
> > +                               madv_behavior, madvise_vma_behavior);
> >         else
> >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> >                                           madvise_vma_behavior);
> > @@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >         total_len = iov_iter_count(iter);
> >
> > -       ret = madvise_lock(mm, behavior);
> > +       ret = __madvise_lock(mm, behavior);
> >         if (ret)
> >                 return ret;
> >         madvise_init_tlb(&madv_behavior, mm);
>
> (I think Lorenzo was saying that he would like madvise_lock() to also
> be used in vector_madvise()? But I'll let him comment on that.)

Thanks
Barry



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 20:40     ` Jann Horn
@ 2025-06-02 11:50       ` Lorenzo Stoakes
  2025-06-03  1:06         ` Barry Song
  2025-06-03  7:06       ` Barry Song
  1 sibling, 1 reply; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-02 11:50 UTC (permalink / raw)
  To: Jann Horn
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Barry - would you mind if I went off and wrote a quick patch to separate
walk_page_range_novma() into walk_page_range_kernel() and
walk_page_range_user_novma()?
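
(For concreteness, hypothetical prototypes for such a split; the names are
the ones proposed above, everything else is illustrative:)

int walk_page_range_kernel(unsigned long start, unsigned long end,
			   const struct mm_walk_ops *ops, pgd_t *pgd,
			   void *private);

int walk_page_range_user_novma(struct mm_struct *mm, unsigned long start,
			       unsigned long end,
			       const struct mm_walk_ops *ops, pgd_t *pgd,
			       void *private);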

I realise this is a pain, but I feel strongly that having them together is
a source of confusion and having us special case a -only used in ptdump-
case like this is not great.

It wouldn't include any VMA locking stuff from this patch, which you could
then rebase on that.

I think it'd make sense as a separate series and I can throw that out
fairly quickly...

But I don't want to step on any toes so just let me know!

On Fri, May 30, 2025 at 10:40:47PM +0200, Jann Horn wrote:
> On Fri, May 30, 2025 at 4:34 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> > Barry - I was going to come back to this later, but Jann's sort of bumped
> > this in my inbox.
> >
> > This implementation isn't quite what I was after, would you give me a
> > little bit before a respin so I can have a think about this and make
> > sensible suggestions?
> >
> > Thanks!
> >
> > On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > One important quirk of this is that it can, from what I can see, cause
> > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > lock at all:
> > >
> > > do_madvise [behavior=MADV_DONTNEED]
> > >   madvise_lock
> > >     lock_vma_under_rcu
> > >   madvise_do_behavior
> > >     madvise_single_locked_vma
> > >       madvise_vma_behavior
> > >         madvise_dontneed_free
> > >           madvise_dontneed_single_vma
> > >             zap_page_range_single_batched [.reclaim_pt = true]
> > >               unmap_single_vma
> > >                 unmap_page_range
> > >                   zap_p4d_range
> > >                     zap_pud_range
> > >                       zap_pmd_range
> > >                         zap_pte_range
> > >                           try_get_and_clear_pmd
> > >                           free_pte
> > >
> > > This clashes with the assumption in walk_page_range_novma() that
> > > holding the mmap lock in write mode is sufficient to prevent
> > > concurrent page table freeing, so it can probably lead to page table
> > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> >
> > Hmmmmmm is this because of the series that allows page table freeing on
> > zap... I think Zi's?
>
> Yeah, that was Qi Zheng's
> https://lore.kernel.org/all/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com/

Yeah need to reference in docs...

I will do this.

> .
>
> > We need to update the documentation on this then... which currently states
> > the VMA need only be stable.
> >
> > I guess this is still the case except for the novma walker you mention.
> >
> > Relatedly, It's worth looking at Dev's series which introduces a concerning
> > new 'no lock at all' mode to the page table walker explicitly for novma. I
> > cc'd you :) See [0].
> >
> > [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/
>
> Yeah, I saw that you CC'ed me; at a first glance that seems relatively
> innocuous to me as long as it's only done for kernel mappings where
> all the rules are different.

Right, but it needs locking down, no pun intended... ;) my review on that
front is over there.

It's only ok if you know it's ok to do. I don't like this function being
general for the really really weird case of user mappings (used in
literally 1 place) and the more common case of walking kernel mappings.

>
> >
> > >
> > > I think before this patch can land, you'll have to introduce some new
> > > helper like:
> > >
> > > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)

At any rate I think this should make explicit it essentially disables VMA
locking.

So mm_take_all_vma_write_locks() maybe with a big comment saying 'this
essentially disables VMA locking' or something?

> > > {
> > >   mmap_write_lock(mm);
> > >   for_each_vma(vmi, vma)
> > >     vma_start_write(vma);
> > > }
> > >
> > > and use that in walk_page_range_novma() for user virtual address space
> > > walks, and update the comment in there.
> >
> > What dude? No, what? Marking literally all VMAs write locked? :/
> >
> > I think this could have unexpected impact no? We're basically disabling VMA
> > locking when we're in novma, that seems... really silly?
>
> I mean, walk_page_range_novma() being used on user virtual address
> space is pretty much a debug-only thing, I don't think it matters if
> it has to spend time poking flags in a few thousand VMAs. I guess the
> alternative would be to say "ptdump just doesn't show entries between
> VMAs, which shouldn't exist in the first place", and change ptdump to
> do a normal walk that skips over userspace areas not covered by a VMA.
> Maybe that's cleaner.

Yeah but I'm guessing the point of ptdump is to pick up on brokenness where
this might not be the case OR maybe there's some weird arches or features
that do, in fact, permit this... at any rate I think we should probably
continue to allow it.

But yeah see the above about separating out. Less egregious that way when
it's _very clearly_ a specific debug mode.

>
> But FWIW, we already do worse than what I proposed here when
> installing MMU notifiers, with mm_take_all_locks().

Yeah...

>
> > > > +       else
> > > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > > +}
> > > > +
> > > >  static bool madvise_batch_tlb_flush(int behavior)
> > > >  {
> > > >         switch (behavior) {
> > > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > > >                 unsigned long start, size_t len_in,
> > > >                 struct madvise_behavior *madv_behavior)
> > > >  {
> > > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > > >         int behavior = madv_behavior->behavior;
> > > > +
> > > >         struct blk_plug plug;
> > > >         unsigned long end;
> > > >         int error;
> > > >
> > > >         if (is_memory_failure(behavior))
> > > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > > -       start = untagged_addr_remote(mm, start);
> > > > +       start = untagged_addr(start);
> > >
> > > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > > the mmap lock is held, which is no longer the case here with your
> > > patch, but untagged_addr() seems wrong here, since we can be operating
> > > on another process. I think especially on X86 with 5-level paging and
> > > LAM, there can probably be cases where address bits are used for part
> > > of the virtual address in one task while they need to be masked off in
> > > another task?
> > >
> > > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > > work... ideally by making sure that their address tagging state
> > > updates are atomic and untagged_addr_remote() works locklessly.
> >
> > Yeah I don't know why we're doing this at all? This seems new unless I
> > missed it?
>
> Because untagged_addr_remote() has a mmap_assert_locked(mm) on x86 and
> reads data that is updated under the mmap lock, I think? So without
> this change you should get a lockdep splat on x86.

Gawd.

Yeah sorry I missed that bit.

Obviously Barry ought to address your concerns here.

>
> > > (Or you could try to use something like the
> > > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > > against untagged_addr(), first write-lock the MM and then write-lock
> > > all VMAs in it...)
> >
> > This would completely eliminate the point of this patch no? The whole point
> > is not taking these locks... And I'm very much not in favour of
> > write-locking literally every single VMA. under any circumstances.
>
> I'm talking about doing this heavyweight locking in places like
> arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) that can, if I understand

An aside: this doesn't seem to be documented at
https://man7.org/linux/man-pages/man2/arch_prctl.2.html ... grumble
grumble.

> correctly, essentially reconfigure the size of the virtual address
> space of a running process from 56-bit to 47-bit at the hardware level
> and cause address bits that were previously part of the virtual
> address to be ignored. READ_ONCE()/WRITE_ONCE() might do the job too,
> but then we'll have to keep in mind that two subsequent invocations of
> untagged_addr() can translate a userspace-specified virtual address
> into two different virtual addresses at the page table level.

I am not familiar enough with tagged pointers to intelligently comment on
this myself... but we need to find a means to address this, obviously.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 22:00   ` Barry Song
@ 2025-06-02 14:55     ` Jann Horn
  2025-06-03  7:51       ` Barry Song
  0 siblings, 1 reply; 35+ messages in thread
From: Jann Horn @ 2025-06-02 14:55 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	Lorenzo Stoakes, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Sat, May 31, 2025 at 12:00 AM Barry Song <21cnbao@gmail.com> wrote:
> On Fri, May 30, 2025 at 10:07 PM Jann Horn <jannh@google.com> wrote:
> > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > >                 unsigned long start, size_t len_in,
> > >                 struct madvise_behavior *madv_behavior)
> > >  {
> > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > >         int behavior = madv_behavior->behavior;
> > > +
> > >         struct blk_plug plug;
> > >         unsigned long end;
> > >         int error;
> > >
> > >         if (is_memory_failure(behavior))
> > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > -       start = untagged_addr_remote(mm, start);
> > > +       start = untagged_addr(start);
> >
> > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > the mmap lock is held, which is no longer the case here with your
> > patch, but untagged_addr() seems wrong here, since we can be operating
> > on another process. I think especially on X86 with 5-level paging and
> > LAM, there can probably be cases where address bits are used for part
> > of the virtual address in one task while they need to be masked off in
> > another task?
> >
> > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > work... ideally by making sure that their address tagging state
> > updates are atomic and untagged_addr_remote() works locklessly.
>
> If possible, can we try to avoid this at least for this stage? We all
> agree that
> a per-VMA lock for DONTNEED is long overdue. The main goal of the patch
> is to drop the mmap_lock for high-frequency madvise operations like
> MADV_DONTNEED and potentially MADV_FREE. For these two cases, it's highly
> unlikely that one process would be managing the memory of another. In v2,
> we're modifying common code, which is why we ended up here.
>
> We could consider doing:
>
> if (current->mm == mm)
>        untagged_addr(start);
> else
>        untagged_addr_remote(mm, start);

Ah, in other words, basically make it so that for now we don't use VMA
locking on remote processes, and then we can have two different paths
here for "local operation" and "MM-locked operation"? That's not very
pretty but from a correctness perspective I'm fine with that.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 20:17     ` Barry Song
@ 2025-06-02 17:35       ` SeongJae Park
  2025-06-02 17:53         ` SeongJae Park
  0 siblings, 1 reply; 35+ messages in thread
From: SeongJae Park @ 2025-06-02 17:35 UTC (permalink / raw)
  To: Barry Song
  Cc: SeongJae Park, Lorenzo Stoakes, Jann Horn, akpm, linux-mm,
	linux-kernel, Barry Song, Liam R. Howlett, David Hildenbrand,
	Vlastimil Babka, Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Sat, 31 May 2025 04:17:51 +0800 Barry Song <21cnbao@gmail.com> wrote:
[...]
> BTW, I found vector_madvise doesn't check the ret value of madvise_lock(),
> it also seems to be a bug?
> 
> static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> {
>                         /* Drop and reacquire lock to unwind race. */
>                         madvise_finish_tlb(&madv_behavior);
>                        madvise_unlock(mm, behavior);
>                        madvise_lock(mm, behavior);  /* missing the ret check */
>                         madvise_init_tlb(&madv_behavior, mm)
> }

Thank you for finding this, I will post a fix soon.


Thanks,
SJ

[...]


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-02 17:35       ` SeongJae Park
@ 2025-06-02 17:53         ` SeongJae Park
  0 siblings, 0 replies; 35+ messages in thread
From: SeongJae Park @ 2025-06-02 17:53 UTC (permalink / raw)
  To: SeongJae Park
  Cc: Barry Song, Lorenzo Stoakes, Jann Horn, akpm, linux-mm,
	linux-kernel, Barry Song, Liam R. Howlett, David Hildenbrand,
	Vlastimil Babka, Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Mon,  2 Jun 2025 10:35:19 -0700 SeongJae Park <sj@kernel.org> wrote:

> On Sat, 31 May 2025 04:17:51 +0800 Barry Song <21cnbao@gmail.com> wrote:
> [...]
> > BTW, I found vector_madvise doesn't check the ret value of madvise_lock(),
> > it also seems to be a bug?
> > 
> > static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> > {
> >                         /* Drop and reacquire lock to unwind race. */
> >                         madvise_finish_tlb(&madv_behavior);
> >                        madvise_unlock(mm, behavior);
> >                        madvise_lock(mm, behavior);  /* missing the ret check */
> >                         madvise_init_tlb(&madv_behavior, mm)
> > }
> 
> Thank you for finding this, I will post a fix soon.

The fix is now available at
https://lore.kernel.org/20250602174926.1074-1-sj@kernel.org
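
For readers who don't follow the link: the shape of such a check is
presumably along the lines of the error handling already used for the
first madvise_lock() call in vector_madvise(), i.e. roughly the
following (the actual fix is the one at the URL above):

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(mm, behavior);
			ret = madvise_lock(mm, behavior);
			if (ret)
				goto out;
			madvise_init_tlb(&madv_behavior, mm);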


Thanks,
SJ

[...]


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-02 11:50       ` Lorenzo Stoakes
@ 2025-06-03  1:06         ` Barry Song
  2025-06-03  9:48           ` Lorenzo Stoakes
  0 siblings, 1 reply; 35+ messages in thread
From: Barry Song @ 2025-06-03  1:06 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Hi Lorenzo,

On Mon, Jun 2, 2025 at 11:50 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> Barry - would you mind if I went off and wrote a quick patch to separate
> walk_page_range_novma() into walk_page_range_kernel() and
> walk_page_range_user_novma()?
>
> I realise this is a pain, but I feel strongly that having them together is
> a source of confusion and having us special case a -only used in ptdump-
> case like this is not great.
>
> It wouldn't include any VMA locking stuff from this patch, which you could
> then rebase on that.
>
> I think it'd make sense as a separate series and I can throw that out
> fairly quickly...
>
> But I don't want to step on any toes so just let me know!

Feel free to proceed with the patch. I can rebase on top of your
walk_page_range_kernel() changes.

Thanks
Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 20:40     ` Jann Horn
  2025-06-02 11:50       ` Lorenzo Stoakes
@ 2025-06-03  7:06       ` Barry Song
  2025-06-03 16:52         ` Jann Horn
  1 sibling, 1 reply; 35+ messages in thread
From: Barry Song @ 2025-06-03  7:06 UTC (permalink / raw)
  To: Jann Horn
  Cc: Lorenzo Stoakes, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Sat, May 31, 2025 at 8:41 AM Jann Horn <jannh@google.com> wrote:
>
> On Fri, May 30, 2025 at 4:34 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> > Barry - I was going to come back to this later, but Jann's sort of bumped
> > this in my inbox.
> >
> > This implementation isn't quite what I was after, would you give me a
> > little bit before a respin so I can have a think about this and make
> > sensible suggestions?
> >
> > Thanks!
> >
> > On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > One important quirk of this is that it can, from what I can see, cause
> > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > lock at all:
> > >
> > > do_madvise [behavior=MADV_DONTNEED]
> > >   madvise_lock
> > >     lock_vma_under_rcu
> > >   madvise_do_behavior
> > >     madvise_single_locked_vma
> > >       madvise_vma_behavior
> > >         madvise_dontneed_free
> > >           madvise_dontneed_single_vma
> > >             zap_page_range_single_batched [.reclaim_pt = true]
> > >               unmap_single_vma
> > >                 unmap_page_range
> > >                   zap_p4d_range
> > >                     zap_pud_range
> > >                       zap_pmd_range
> > >                         zap_pte_range
> > >                           try_get_and_clear_pmd
> > >                           free_pte
> > >
> > > This clashes with the assumption in walk_page_range_novma() that
> > > holding the mmap lock in write mode is sufficient to prevent
> > > concurrent page table freeing, so it can probably lead to page table
> > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> >
> > Hmmmmmm is this because of the series that allows page table freeing on
> > zap... I think Zi's?
>
> Yeah, that was Qi Zheng's
> https://lore.kernel.org/all/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com/
> .
>
> > We need to update the documentation on this then... which currently states
> > the VMA need only be stable.
> >
> > I guess this is still the case except for the novma walker you mention.
> >
> > Relatedly, It's worth looking at Dev's series which introduces a concerning
> > new 'no lock at all' mode to the page table walker explicitly for novma. I
> > cc'd you :) See [0].
> >
> > [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/
>
> Yeah, I saw that you CC'ed me; at a first glance that seems relatively
> innocuous to me as long as it's only done for kernel mappings where
> all the rules are different.
>
> >
> > >
> > > I think before this patch can land, you'll have to introduce some new
> > > helper like:
> > >
> > > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> > > {
> > >   mmap_write_lock(mm);
> > >   for_each_vma(vmi, vma)
> > >     vma_start_write(vma);
> > > }
> > >
> > > and use that in walk_page_range_novma() for user virtual address space
> > > walks, and update the comment in there.
> >
> > What dude? No, what? Marking literally all VMAs write locked? :/
> >
> > I think this could have unexpected impact no? We're basically disabling VMA
> > locking when we're in novma, that seems... really silly?
>
> I mean, walk_page_range_novma() being used on user virtual address
> space is pretty much a debug-only thing, I don't think it matters if
> it has to spend time poking flags in a few thousand VMAs. I guess the
> alternative would be to say "ptdump just doesn't show entries between
> VMAs, which shouldn't exist in the first place", and change ptdump to
> do a normal walk that skips over userspace areas not covered by a VMA.
> Maybe that's cleaner.
>
> But FWIW, we already do worse than what I proposed here when
> installing MMU notifiers, with mm_take_all_locks().
>
> > > > +       else
> > > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > > +}
> > > > +
> > > >  static bool madvise_batch_tlb_flush(int behavior)
> > > >  {
> > > >         switch (behavior) {
> > > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > > >                 unsigned long start, size_t len_in,
> > > >                 struct madvise_behavior *madv_behavior)
> > > >  {
> > > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > > >         int behavior = madv_behavior->behavior;
> > > > +
> > > >         struct blk_plug plug;
> > > >         unsigned long end;
> > > >         int error;
> > > >
> > > >         if (is_memory_failure(behavior))
> > > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > > -       start = untagged_addr_remote(mm, start);
> > > > +       start = untagged_addr(start);
> > >
> > > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > > the mmap lock is held, which is no longer the case here with your
> > > patch, but untagged_addr() seems wrong here, since we can be operating
> > > on another process. I think especially on X86 with 5-level paging and
> > > LAM, there can probably be cases where address bits are used for part
> > > of the virtual address in one task while they need to be masked off in
> > > another task?
> > >
> > > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > > work... ideally by making sure that their address tagging state
> > > updates are atomic and untagged_addr_remote() works locklessly.
> >
> > Yeah I don't know why we're doing this at all? This seems new unless I
> > missed it?
>
> Because untagged_addr_remote() has a mmap_assert_locked(mm) on x86 and
> reads data that is updated under the mmap lock, I think? So without
> this change you should get a lockdep splat on x86.
>
> > > (Or you could try to use something like the
> > > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > > against untagged_addr(), first write-lock the MM and then write-lock
> > > all VMAs in it...)
> >
> > This would completely eliminate the point of this patch no? The whole point
> > is not taking these locks... And I'm very much not in favour of
> > write-locking literally every single VMA. under any circumstances.
>
> I'm talking about doing this heavyweight locking in places like
> arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) that can, if I understand
> correctly, essentially reconfigure the size of the virtual address
> space of a running process from 56-bit to 47-bit at the hardware level
> and cause address bits that were previously part of the virtual
> address to be ignored. READ_ONCE()/WRITE_ONCE() might do the job too,
> but then we'll have to keep in mind that two subsequent invocations of
> untagged_addr() can translate a userspace-specified virtual address
> into two different virtual addresses at the page table level.

I’m confused about how arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) can
reconfigure a running process from using 56-bit addresses to 47-bit.
I read the code and see the x86 kernel only supports LAM U57, and not
LAM U48 at all:

static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
        ...

        if (!nr_bits || nr_bits > LAM_U57_BITS) {
                mmap_write_unlock(mm);
                return -EINVAL;
        }

        mm_enable_lam(mm);
        mmap_write_unlock(mm);

        return 0;
}

I still don't fully understand why x86 differs from ARM64,
where the same bit mask is always applied unconditionally.

On ARM64, we can even enable or disable PROT_MTE on a per-VMA basis
using mmap or mprotect. However, the same bitmask operation is
always executed regardless of whether memory tags are present for a
given VMA.

I mean, on arm64, if a process or a VMA doesn't have tag access
enabled, and we pass an address with high bits to madvise,
untagged_addr() will still strip the tag. But wouldn't that address
be invalid for a process or VMA that doesn't have TBI enabled?
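
(As a concrete picture of "pass an address with high bits to madvise":
on arm64, with the tagged address ABI opted into, the user-space side
looks roughly like the sketch below. It is only an illustration, not
part of the patch.)

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL	55
#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
#endif

int main(void)
{
	void *p, *tagged;

	/* Opt in to the arm64 tagged address ABI for syscall arguments. */
	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0))
		perror("prctl");

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Stuff an arbitrary tag into the top byte of the pointer. */
	tagged = (void *)((uintptr_t)p | (0x2aUL << 56));

	/* The kernel strips the tag (untagged_addr()) before the VMA lookup. */
	if (madvise(tagged, 4096, MADV_DONTNEED))
		perror("madvise");

	return 0;
}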

Thanks
Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 14:06 ` Jann Horn
  2025-05-30 14:34   ` Lorenzo Stoakes
  2025-05-30 22:00   ` Barry Song
@ 2025-06-03  7:24   ` Qi Zheng
  2025-06-03  9:54     ` Lorenzo Stoakes
  2 siblings, 1 reply; 35+ messages in thread
From: Qi Zheng @ 2025-06-03  7:24 UTC (permalink / raw)
  To: Jann Horn, Barry Song
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	Lorenzo Stoakes, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Hi Jann,

On 5/30/25 10:06 PM, Jann Horn wrote:
> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
>> Certain madvise operations, especially MADV_DONTNEED, occur far more
>> frequently than other madvise options, particularly in native and Java
>> heaps for dynamic memory management.
>>
>> Currently, the mmap_lock is always held during these operations, even when
>> unnecessary. This causes lock contention and can lead to severe priority
>> inversion, where low-priority threads—such as Android's HeapTaskDaemon—
>> hold the lock and block higher-priority threads.
>>
>> This patch enables the use of per-VMA locks when the advised range lies
>> entirely within a single VMA, avoiding the need for full VMA traversal. In
>> practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
>>
>> Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
>> benefits from this per-VMA lock optimization. After extended runtime,
>> 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
>> only 1,231 fell back to mmap_lock.
>>
>> To simplify handling, the implementation falls back to the standard
>> mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
>> userfaultfd_remove().
> 
> One important quirk of this is that it can, from what I can see, cause
> freeing of page tables (through pt_reclaim) without holding the mmap
> lock at all:
> 
> do_madvise [behavior=MADV_DONTNEED]
>    madvise_lock
>      lock_vma_under_rcu
>    madvise_do_behavior
>      madvise_single_locked_vma
>        madvise_vma_behavior
>          madvise_dontneed_free
>            madvise_dontneed_single_vma
>              zap_page_range_single_batched [.reclaim_pt = true]
>                unmap_single_vma
>                  unmap_page_range
>                    zap_p4d_range
>                      zap_pud_range
>                        zap_pmd_range
>                          zap_pte_range
>                            try_get_and_clear_pmd
>                            free_pte
> 
> This clashes with the assumption in walk_page_range_novma() that
> holding the mmap lock in write mode is sufficient to prevent
> concurrent page table freeing, so it can probably lead to page table
> UAF through the ptdump interface (see ptdump_walk_pgd()).

Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
following case:

cpu 0				cpu 1

ptdump_walk_pgd
--> walk_pte_range
     --> pte_offset_map (hold RCU read lock)
				zap_pte_range
				--> free_pte (via RCU)
         walk_pte_range_inner
         --> ptdump_pte_entry (the PTE page is not freed at this time)

IIUC, there is no UAF issue here?

If I missed anything please let me know.
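
(Schematically, that is the usual RCU publish/defer pattern; all names
below are made up and this is not the actual pte code, just the shape of
the argument:)

	/* Reader side, i.e. what pte_offset_map() gives the ptdump walk: */
	rcu_read_lock();
	p = rcu_dereference(gp);	/* gp: some RCU-protected pointer */
	if (p)
		use(p);			/* p cannot be freed underneath us */
	rcu_read_unlock();

	/* Updater side, i.e. what the deferred free amounts to: */
	old = gp;
	rcu_assign_pointer(gp, NULL);
	call_rcu(&old->rcu, free_cb);	/* free_cb runs only after all
					 * pre-existing readers have exited */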

Thanks,
Qi




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-02 14:55     ` Jann Horn
@ 2025-06-03  7:51       ` Barry Song
  0 siblings, 0 replies; 35+ messages in thread
From: Barry Song @ 2025-06-03  7:51 UTC (permalink / raw)
  To: Jann Horn
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	Lorenzo Stoakes, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 3, 2025 at 2:56 AM Jann Horn <jannh@google.com> wrote:
>
> On Sat, May 31, 2025 at 12:00 AM Barry Song <21cnbao@gmail.com> wrote:
> > On Fri, May 30, 2025 at 10:07 PM Jann Horn <jannh@google.com> wrote:
> > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > > >                 unsigned long start, size_t len_in,
> > > >                 struct madvise_behavior *madv_behavior)
> > > >  {
> > > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > > >         int behavior = madv_behavior->behavior;
> > > > +
> > > >         struct blk_plug plug;
> > > >         unsigned long end;
> > > >         int error;
> > > >
> > > >         if (is_memory_failure(behavior))
> > > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > > -       start = untagged_addr_remote(mm, start);
> > > > +       start = untagged_addr(start);
> > >
> > > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > > the mmap lock is held, which is no longer the case here with your
> > > patch, but untagged_addr() seems wrong here, since we can be operating
> > > on another process. I think especially on X86 with 5-level paging and
> > > LAM, there can probably be cases where address bits are used for part
> > > of the virtual address in one task while they need to be masked off in
> > > another task?
> > >
> > > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > > work... ideally by making sure that their address tagging state
> > > updates are atomic and untagged_addr_remote() works locklessly.
> >
> > If possible, can we try to avoid this at least for this stage? We all
> > agree that
> > a per-VMA lock for DONTNEED is long overdue. The main goal of the patch
> > is to drop the mmap_lock for high-frequency madvise operations like
> > MADV_DONTNEED and potentially MADV_FREE. For these two cases, it's highly
> > unlikely that one process would be managing the memory of another. In v2,
> > we're modifying common code, which is why we ended up here.
> >
> > We could consider doing:
> >
> > if (current->mm == mm)
> >        untagged_addr(start);
> > else
> >        untagged_addr_remote(mm, start);
>
> Ah, in other words, basically make it so that for now we don't use VMA
> locking on remote processes, and then we can have two different paths
> here for "local operation" and "MM-locked operation"? That's not very
> pretty but from a correctness perspective I'm fine with that.

Right, that’s exactly what I mean—we may hold off on remote `madvise` for
now unless we can find a straightforward way to fix up the architecture
code, especially since the tagging implementations in x86 and RISC-V are
quite confusing.

It’s particularly tricky for RISC-V, which supports two different PMLEN
values simultaneously. Resolving the untagging issue will likely require
extensive discussions with both the x86 and RISC-V architecture teams.

long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg)
{
        ...
        /*
         * Prefer the smallest PMLEN that satisfies the user's request,
         * in case choosing a larger PMLEN has a performance impact.
         */
        pmlen = FIELD_GET(PR_PMLEN_MASK, arg);
        if (pmlen == PMLEN_0) {
                pmm = ENVCFG_PMM_PMLEN_0;
        } else if (pmlen <= PMLEN_7 && have_user_pmlen_7) {
                pmlen = PMLEN_7;
                pmm = ENVCFG_PMM_PMLEN_7;
        } else if (pmlen <= PMLEN_16 && have_user_pmlen_16) {
                pmlen = PMLEN_16;
                pmm = ENVCFG_PMM_PMLEN_16;
        } else {
                return -EINVAL;
        }
        ...
}

It’s strange that x86’s LAM U48 was rejected, while RISC-V’s PMLEN values
of 7 and 16 were both accepted.

Another reason is that we’re not too concerned about remote `madvise`
at this stage, as our current focus is on high-frequency cases like
`MADV_DONTNEED`, and possibly `MADV_FREE`.

Thanks
Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03  1:06         ` Barry Song
@ 2025-06-03  9:48           ` Lorenzo Stoakes
  0 siblings, 0 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-03  9:48 UTC (permalink / raw)
  To: Barry Song
  Cc: Jann Horn, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 03, 2025 at 01:06:15PM +1200, Barry Song wrote:
> Hi Lorenzo,
>
> On Mon, Jun 2, 2025 at 11:50 PM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > Barry - would you mind if I went off and wrote a quick patch to separate
> > walk_page_range_novma() into walk_page_range_kernel() and
> > walk_page_range_user_novma()?
> >
> > I realise this is a pain, but I feel strongly that having them together is
> > a source of confusion and having us special case a -only used in ptdump-
> > case like this is not great.
> >
> > It wouldn't include any VMA locking stuff from this patch, which you could
> > then rebase on that.
> >
> > I think it'd make sense as a separate series and I can throw that out
> > fairly quickly...
> >
> > But I don't want to step on any toes so just let me know!
>
> Feel free to proceed with the patch. I can rebase on top of your
> walk_page_range_kernel() changes.

Thanks! Will send that out hopefully today.

I will also come back to you about how I feel the abstraction here should
work; sorry, I'm getting tied up with other stuff so got delayed on
that...!

>
> Thanks
> Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03  7:24   ` Qi Zheng
@ 2025-06-03  9:54     ` Lorenzo Stoakes
  2025-06-04  6:02       ` Qi Zheng
  0 siblings, 1 reply; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-03  9:54 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
> Hi Jann,
>
> On 5/30/25 10:06 PM, Jann Horn wrote:
> > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > > frequently than other madvise options, particularly in native and Java
> > > heaps for dynamic memory management.
> > >
> > > Currently, the mmap_lock is always held during these operations, even when
> > > unnecessary. This causes lock contention and can lead to severe priority
> > > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > > hold the lock and block higher-priority threads.
> > >
> > > This patch enables the use of per-VMA locks when the advised range lies
> > > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> > >
> > > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > > benefits from this per-VMA lock optimization. After extended runtime,
> > > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > > only 1,231 fell back to mmap_lock.
> > >
> > > To simplify handling, the implementation falls back to the standard
> > > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > > userfaultfd_remove().
> >
> > One important quirk of this is that it can, from what I can see, cause
> > freeing of page tables (through pt_reclaim) without holding the mmap
> > lock at all:
> >
> > do_madvise [behavior=MADV_DONTNEED]
> >    madvise_lock
> >      lock_vma_under_rcu
> >    madvise_do_behavior
> >      madvise_single_locked_vma
> >        madvise_vma_behavior
> >          madvise_dontneed_free
> >            madvise_dontneed_single_vma
> >              zap_page_range_single_batched [.reclaim_pt = true]
> >                unmap_single_vma
> >                  unmap_page_range
> >                    zap_p4d_range
> >                      zap_pud_range
> >                        zap_pmd_range
> >                          zap_pte_range
> >                            try_get_and_clear_pmd
> >                            free_pte
> >
> > This clashes with the assumption in walk_page_range_novma() that
> > holding the mmap lock in write mode is sufficient to prevent
> > concurrent page table freeing, so it can probably lead to page table
> > UAF through the ptdump interface (see ptdump_walk_pgd()).
>
> Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
> following case:
>
> cpu 0				cpu 1
>
> ptdump_walk_pgd
> --> walk_pte_range
>     --> pte_offset_map (hold RCU read lock)
> 				zap_pte_range
> 				--> free_pte (via RCU)
>         walk_pte_range_inner
>         --> ptdump_pte_entry (the PTE page is not freed at this time)
>
> IIUC, there is no UAF issue here?
>
> If I missed anything please let me know.
>
> Thanks,
> Qi
>
>

I forgot about that interesting placement of RCU lock acquisition :) I will
obviously let Jann come back to you on this, but I wonder if I need to
update the doc to reflect this actually.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03  7:06       ` Barry Song
@ 2025-06-03 16:52         ` Jann Horn
  2025-06-05 10:27           ` Barry Song
  0 siblings, 1 reply; 35+ messages in thread
From: Jann Horn @ 2025-06-03 16:52 UTC (permalink / raw)
  To: Barry Song
  Cc: Lorenzo Stoakes, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 3, 2025 at 9:06 AM Barry Song <21cnbao@gmail.com> wrote:
> On Sat, May 31, 2025 at 8:41 AM Jann Horn <jannh@google.com> wrote:
> > On Fri, May 30, 2025 at 4:34 PM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > > On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > One important quirk of this is that it can, from what I can see, cause
> > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > lock at all:
> > > >
> > > > do_madvise [behavior=MADV_DONTNEED]
> > > >   madvise_lock
> > > >     lock_vma_under_rcu
> > > >   madvise_do_behavior
> > > >     madvise_single_locked_vma
> > > >       madvise_vma_behavior
> > > >         madvise_dontneed_free
> > > >           madvise_dontneed_single_vma
> > > >             zap_page_range_single_batched [.reclaim_pt = true]
> > > >               unmap_single_vma
> > > >                 unmap_page_range
> > > >                   zap_p4d_range
> > > >                     zap_pud_range
> > > >                       zap_pmd_range
> > > >                         zap_pte_range
> > > >                           try_get_and_clear_pmd
> > > >                           free_pte
> > > >
> > > > This clashes with the assumption in walk_page_range_novma() that
> > > > holding the mmap lock in write mode is sufficient to prevent
> > > > concurrent page table freeing, so it can probably lead to page table
> > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > >
> > > Hmmmmmm is this because of the series that allows page table freeing on
> > > zap... I think Zi's?
> >
> > Yeah, that was Qi Zheng's
> > https://lore.kernel.org/all/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com/
> > .
> >
> > > We need to update the documentation on this then... which currently states
> > > the VMA need only be stable.
> > >
> > > I guess this is still the case except for the novma walker you mention.
> > >
> > > Relatedly, It's worth looking at Dev's series which introduces a concerning
> > > new 'no lock at all' mode to the page table walker explicitly for novma. I
> > > cc'd you :) See [0].
> > >
> > > [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/
> >
> > Yeah, I saw that you CC'ed me; at a first glance that seems relatively
> > innocuous to me as long as it's only done for kernel mappings where
> > all the rules are different.
> >
> > >
> > > >
> > > > I think before this patch can land, you'll have to introduce some new
> > > > helper like:
> > > >
> > > > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> > > > {
> > > >   mmap_write_lock(mm);
> > > >   for_each_vma(vmi, vma)
> > > >     vma_start_write(vma);
> > > > }
> > > >
> > > > and use that in walk_page_range_novma() for user virtual address space
> > > > walks, and update the comment in there.
> > >
> > > What dude? No, what? Marking literally all VMAs write locked? :/
> > >
> > > I think this could have unexpected impact no? We're basically disabling VMA
> > > locking when we're in novma, that seems... really silly?
> >
> > I mean, walk_page_range_novma() being used on user virtual address
> > space is pretty much a debug-only thing, I don't think it matters if
> > it has to spend time poking flags in a few thousand VMAs. I guess the
> > alternative would be to say "ptdump just doesn't show entries between
> > VMAs, which shouldn't exist in the first place", and change ptdump to
> > do a normal walk that skips over userspace areas not covered by a VMA.
> > Maybe that's cleaner.
> >
> > But FWIW, we already do worse than what I proposed here when
> > installing MMU notifiers, with mm_take_all_locks().
> >
> > > > > +       else
> > > > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > > > +}
> > > > > +
> > > > >  static bool madvise_batch_tlb_flush(int behavior)
> > > > >  {
> > > > >         switch (behavior) {
> > > > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > > > >                 unsigned long start, size_t len_in,
> > > > >                 struct madvise_behavior *madv_behavior)
> > > > >  {
> > > > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > > > >         int behavior = madv_behavior->behavior;
> > > > > +
> > > > >         struct blk_plug plug;
> > > > >         unsigned long end;
> > > > >         int error;
> > > > >
> > > > >         if (is_memory_failure(behavior))
> > > > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > > > -       start = untagged_addr_remote(mm, start);
> > > > > +       start = untagged_addr(start);
> > > >
> > > > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > > > the mmap lock is held, which is no longer the case here with your
> > > > patch, but untagged_addr() seems wrong here, since we can be operating
> > > > on another process. I think especially on X86 with 5-level paging and
> > > > LAM, there can probably be cases where address bits are used for part
> > > > of the virtual address in one task while they need to be masked off in
> > > > another task?
> > > >
> > > > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > > > work... ideally by making sure that their address tagging state
> > > > updates are atomic and untagged_addr_remote() works locklessly.
> > >
> > > Yeah I don't know why we're doing this at all? This seems new unless I
> > > missed it?
> >
> > Because untagged_addr_remote() has a mmap_assert_locked(mm) on x86 and
> > reads data that is updated under the mmap lock, I think? So without
> > this change you should get a lockdep splat on x86.
> >
> > > > (Or you could try to use something like the
> > > > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > > > against untagged_addr(), first write-lock the MM and then write-lock
> > > > all VMAs in it...)
> > >
> > > This would completely eliminate the point of this patch no? The whole point
> > > is not taking these locks... And I'm very much not in favour of
> > > write-locking literally every single VMA. under any circumstances.
> >
> > I'm talking about doing this heavyweight locking in places like
> > arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) that can, if I understand
> > correctly, essentially reconfigure the size of the virtual address
> > space of a running process from 56-bit to 47-bit at the hardware level
> > and cause address bits that were previously part of the virtual
> > address to be ignored. READ_ONCE()/WRITE_ONCE() might do the job too,
> > but then we'll have to keep in mind that two subsequent invocations of
> > untagged_addr() can translate a userspace-specified virtual address
> > into two different virtual addresses at the page table level.
>
> I’m confused about how arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) can
> reconfigure a running process from using 56-bit addresses to 47-bit.
> I read the code and see the x86 kernel only supports LAM U57, and not
> LAM U48 at all:
>
> static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
> {
>         ...
>
>         if (!nr_bits || nr_bits > LAM_U57_BITS) {
>                 mmap_write_unlock(mm);
>                 return -EINVAL;
>         }
>
>         mm_enable_lam(mm);
>         mmap_write_unlock(mm);
>
>         return 0;
> }

Oh, you're right, currently only LAM U57 is supported by Linux, I was
making bad assumptions.

commit 2f8794bd087e, which introduced that code, also mentions that
"For now only LAM_U57 is supported, with 6 tag bits".

> I still don't fully understand why x86 differs from ARM64,
> where the same bit mask is always applied unconditionally.
>
> On ARM64, we can even enable or disable PROT_MTE on a per-VMA basis
> using mmap or mprotect. However, the same bitmask operation is
> always executed regardless of whether memory tags are present for a
> given VMA.

Hmm, true, that does look like a weird difference.

> I mean, on arm64, if a process or a VMA doesn't have tag access
> enabled, and we pass an address with high bits to madvise,
> untagged_addr() will still strip the tag. But wouldn't that address
> be invalid for a process or VMA that doesn't have TBI enabled?

Yeah, I guess in that regard it might be fine to always strip the bits...

Maybe the situation on arm64 is simpler, in that these bits are always
either tag bits or unused on arm64, while my understanding is that on
X86, the CPU supports changing the meaning of address bits between
"part of virtual address" and "ignored tag bits". So I think stripping
the bits might work fine for LAM U57, but not for LAM U48, and maybe
the code is trying to be future-proof in case someone wants to add
support for LAM U48?
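
(A rough way to picture that difference, with made-up helper names
rather than the real arch code:)

/* arm64-style: the tag is always the top byte, no per-mm state needed. */
static inline unsigned long untag_fixed(unsigned long addr)
{
	return addr & ~(0xffUL << 56);
}

/*
 * x86-LAM-style: which bits are tag bits depends on how this mm was
 * configured (LAM U57 today, possibly LAM U48 one day), so the mask is
 * per-mm data that the existing code reads under the mmap lock.
 */
static inline unsigned long untag_per_mm(unsigned long addr,
					 unsigned long mm_untag_mask)
{
	return addr & mm_untag_mask;
}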

It is arguably also a bit more robust to reject garbage addresses
instead of ignoring bits that would cause the CPU to treat the address
as noncanonical, but I guess doing that just in MM code is not a big
problem. (Unless someone relies on seccomp filters that block
manipulation of specific virtual address ranges via munmap() and such,
but I think almost nobody does that. I think I've only seen that once
in some rarely-used component of the Tor project.)

But I am out of my depth here and might be severely misunderstanding
what's going on.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-05-30 10:44 [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED Barry Song
  2025-05-30 14:06 ` Jann Horn
@ 2025-06-03 18:43 ` Lorenzo Stoakes
  2025-06-03 20:17   ` Suren Baghdasaryan
  2025-06-03 20:59   ` Pedro Falcato
  1 sibling, 2 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-03 18:43 UTC (permalink / raw)
  To: Barry Song
  Cc: akpm, linux-mm, linux-kernel, Barry Song, Liam R. Howlett,
	David Hildenbrand, Vlastimil Babka, Jann Horn, Suren Baghdasaryan,
	Lokesh Gidra, Tangquan Zheng

Hi Barry,

As promised, I enclose a patch to give a sense of how I think we might
thread state through this operation.

There's a todo on the untagged stuff so you can figure that out. This is
based on the v1 so it might not encompass everything you addressed in the
v2.

Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks; I
_despise_ the void *arg function ptr stuff there, added just for the anon
vma name stuff (ughhh), so this might be the only sensible way of
threading state.

I don't need any attribution, so please use this patch as you see
fit/adapt/delete/do whatever with it, just an easier way for me to show the
idea!

I did some very basic testing and it seems to work, but nothing deeper.

Cheers, Lorenzo

----8<----
From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Tue, 3 Jun 2025 18:22:55 +0100
Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]

Refactor the madvise() code to retain state about the locking mode utilised
for traversing VMAs.

Then use this mechanism to permit VMA locking to be done later in the
madvise() logic and also to allow altering of the locking mode to permit
falling back to an mmap read lock if required.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/madvise.c | 174 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 127 insertions(+), 47 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 5f7a66a1617e..a3a6d73d0bd5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -48,38 +48,19 @@ struct madvise_walk_private {
 	bool pageout;
 };

+enum madvise_lock_mode {
+	MADVISE_NO_LOCK,
+	MADVISE_MMAP_READ_LOCK,
+	MADVISE_MMAP_WRITE_LOCK,
+	MADVISE_VMA_READ_LOCK,
+};
+
 struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
+	enum madvise_lock_mode lock_mode;
 };

-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-	case MADV_WILLNEED:
-	case MADV_DONTNEED:
-	case MADV_DONTNEED_LOCKED:
-	case MADV_COLD:
-	case MADV_PAGEOUT:
-	case MADV_FREE:
-	case MADV_POPULATE_READ:
-	case MADV_POPULATE_WRITE:
-	case MADV_COLLAPSE:
-	case MADV_GUARD_INSTALL:
-	case MADV_GUARD_REMOVE:
-		return 0;
-	default:
-		/* be safe, default to 1. list exceptions explicitly */
-		return 1;
-	}
-}
-
 #ifdef CONFIG_ANON_VMA_NAME
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
@@ -1486,6 +1467,43 @@ static bool process_madvise_remote_valid(int behavior)
 	}
 }

+/*
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span, either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return the locked VMA.
+ *
+ * If a VMA read lock could not be acquired, we return NULL and expect caller to
+ * fallback to mmap lock behaviour.
+ */
+static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	if (!madv_behavior || madv_behavior->lock_mode != MADVISE_VMA_READ_LOCK)
+		return NULL;
+
+	vma = lock_vma_under_rcu(mm, start);
+	if (!vma)
+		goto take_mmap_read_lock;
+	/* We must span only a single VMA, uffd unsupported. */
+	if (end > vma->vm_end || userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto take_mmap_read_lock;
+	}
+	return vma;
+
+take_mmap_read_lock:
+	mmap_read_lock(mm);
+	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+	return NULL;
+}
+
 /*
  * Walk the vmas in range [start,end), and call the visit function on each one.
  * The visit function will get start and end parameters that cover the overlap
@@ -1496,7 +1514,8 @@ static bool process_madvise_remote_valid(int behavior)
  */
 static
 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		      unsigned long end, void *arg,
+		      unsigned long end, struct madvise_behavior *madv_behavior,
+		      void *arg,
 		      int (*visit)(struct vm_area_struct *vma,
 				   struct vm_area_struct **prev, unsigned long start,
 				   unsigned long end, void *arg))
@@ -1505,6 +1524,15 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	struct vm_area_struct *prev;
 	unsigned long tmp;
 	int unmapped_error = 0;
+	int error;
+
+	/* If VMA read lock supported, we apply advice to a single VMA only. */
+	vma = try_vma_read_lock(mm, madv_behavior, start, end);
+	if (vma) {
+		error = visit(vma, &prev, start, end, arg);
+		vma_end_read(vma);
+		return error;
+	}

 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -1516,8 +1544,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 		prev = vma;

 	for (;;) {
-		int error;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
@@ -1598,34 +1624,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	if (end == start)
 		return 0;

-	return madvise_walk_vmas(mm, start, end, anon_name,
+	return madvise_walk_vmas(mm, start, end, anon_name, NULL,
 				 madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */

-static int madvise_lock(struct mm_struct *mm, int behavior)
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
 {
+	int behavior = madv_behavior->behavior;
+
 	if (is_memory_failure(behavior))
-		return 0;
+		return MADVISE_NO_LOCK;

-	if (madvise_need_mmap_write(behavior)) {
+	switch (behavior) {
+	case MADV_REMOVE:
+	case MADV_WILLNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
+	case MADV_FREE:
+	case MADV_POPULATE_READ:
+	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
+	case MADV_GUARD_INSTALL:
+	case MADV_GUARD_REMOVE:
+		return MADVISE_MMAP_READ_LOCK;
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+		return MADVISE_VMA_READ_LOCK;
+	default:
+		return MADVISE_MMAP_WRITE_LOCK;
+	}
+}
+
+static int madvise_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
+
+	switch (lock_mode) {
+	case MADVISE_NO_LOCK:
+		break;
+	case MADVISE_MMAP_WRITE_LOCK:
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;
-	} else {
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_lock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+		break;
 	}
+
+	madv_behavior->lock_mode = lock_mode;
 	return 0;
 }

-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
 {
-	if (is_memory_failure(behavior))
+	switch (madv_behavior->lock_mode) {
+	case  MADVISE_NO_LOCK:
 		return;
-
-	if (madvise_need_mmap_write(behavior))
+	case MADVISE_MMAP_WRITE_LOCK:
 		mmap_write_unlock(mm);
-	else
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_unlock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
+		break;
+	}
+
+	madv_behavior->lock_mode = MADVISE_NO_LOCK;
 }

 static bool madvise_batch_tlb_flush(int behavior)
@@ -1721,6 +1799,8 @@ static int madvise_do_behavior(struct mm_struct *mm,

 	if (is_memory_failure(behavior))
 		return madvise_inject_error(behavior, start, start + len_in);
+
+	// TODO: handle untagged stuff here...
 	start = untagged_addr(start); //untagged_addr_remote(mm, start);
 	end = start + PAGE_ALIGN(len_in);

@@ -1729,7 +1809,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
 		error = madvise_populate(mm, start, end, behavior);
 	else
 		error = madvise_walk_vmas(mm, start, end, madv_behavior,
-					  madvise_vma_behavior);
+					  madv_behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
@@ -1817,13 +1897,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh

 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, behavior);
+	error = madvise_lock(mm, &madv_behavior);
 	if (error)
 		return error;
 	madvise_init_tlb(&madv_behavior, mm);
 	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);

 	return error;
 }
@@ -1847,7 +1927,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,

 	total_len = iov_iter_count(iter);

-	ret = madvise_lock(mm, behavior);
+	ret = madvise_lock(mm, &madv_behavior);
 	if (ret)
 		return ret;
 	madvise_init_tlb(&madv_behavior, mm);
@@ -1880,8 +1960,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,

 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, behavior);
-			ret = madvise_lock(mm, behavior);
+			madvise_unlock(mm, &madv_behavior);
+			ret = madvise_lock(mm, &madv_behavior);
 			if (ret)
 				goto out;
 			madvise_init_tlb(&madv_behavior, mm);
@@ -1892,7 +1972,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);

 out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
--
2.49.0


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 18:43 ` Lorenzo Stoakes
@ 2025-06-03 20:17   ` Suren Baghdasaryan
  2025-06-04  5:22     ` Lorenzo Stoakes
  2025-06-06  7:18     ` Barry Song
  2025-06-03 20:59   ` Pedro Falcato
  1 sibling, 2 replies; 35+ messages in thread
From: Suren Baghdasaryan @ 2025-06-03 20:17 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Lokesh Gidra, Tangquan Zheng

On Tue, Jun 3, 2025 at 11:43 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> Hi Barry,
>
> As promised, I enclose a patch to give a sense of how I think we might
> thread state through this operation.
>
> There's a todo on the untagged stuff so you can figure that out. This is
> based on the v1 so it might not encompass everything you addressed in the
> v2.
>
> Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks; I
> _despise_ the void *arg function ptr stuff there, added just for the anon
> vma name stuff (ughhh), so this might be the only sensible way of
> threading state.
>
> I don't need any attribution, so please use this patch as you see
> fit/adapt/delete/do whatever with it, just an easier way for me to show the
> idea!
>
> I did some very basic testing and it seems to work, but nothing deeper.
>
> Cheers, Lorenzo
>
> ----8<----
> From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Date: Tue, 3 Jun 2025 18:22:55 +0100
> Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
>
> Refactor the madvise() code to retain state about the locking mode utilised
> for traversing VMAs.
>
> Then use this mechanism to permit VMA locking to be done later in the
> madvise() logic and also to allow altering of the locking mode to permit
> falling back to an mmap read lock if required.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/madvise.c | 174 +++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 127 insertions(+), 47 deletions(-)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 5f7a66a1617e..a3a6d73d0bd5 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -48,38 +48,19 @@ struct madvise_walk_private {
>         bool pageout;
>  };
>
> +enum madvise_lock_mode {
> +       MADVISE_NO_LOCK,
> +       MADVISE_MMAP_READ_LOCK,
> +       MADVISE_MMAP_WRITE_LOCK,
> +       MADVISE_VMA_READ_LOCK,
> +};
> +
>  struct madvise_behavior {
>         int behavior;
>         struct mmu_gather *tlb;
> +       enum madvise_lock_mode lock_mode;
>  };
>
> -/*
> - * Any behaviour which results in changes to the vma->vm_flags needs to
> - * take mmap_lock for writing. Others, which simply traverse vmas, need
> - * to only take it for reading.
> - */
> -static int madvise_need_mmap_write(int behavior)
> -{
> -       switch (behavior) {
> -       case MADV_REMOVE:
> -       case MADV_WILLNEED:
> -       case MADV_DONTNEED:
> -       case MADV_DONTNEED_LOCKED:
> -       case MADV_COLD:
> -       case MADV_PAGEOUT:
> -       case MADV_FREE:
> -       case MADV_POPULATE_READ:
> -       case MADV_POPULATE_WRITE:
> -       case MADV_COLLAPSE:
> -       case MADV_GUARD_INSTALL:
> -       case MADV_GUARD_REMOVE:
> -               return 0;
> -       default:
> -               /* be safe, default to 1. list exceptions explicitly */
> -               return 1;
> -       }
> -}
> -
>  #ifdef CONFIG_ANON_VMA_NAME
>  struct anon_vma_name *anon_vma_name_alloc(const char *name)
>  {
> @@ -1486,6 +1467,43 @@ static bool process_madvise_remote_valid(int behavior)
>         }
>  }
>
> +/*
> + * Try to acquire a VMA read lock if possible.
> + *
> + * We only support this lock over a single VMA, which the input range must
> + * span, either partially or fully.
> + *
> + * This function always returns with an appropriate lock held. If a VMA read
> + * lock could be acquired, we return the locked VMA.
> + *
> + * If a VMA read lock could not be acquired, we return NULL and expect caller to

Worth mentioning that the function itself will fall back to taking
mmap_read_lock in such a case.

> + * fallback to mmap lock behaviour.
> + */
> +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> +               struct madvise_behavior *madv_behavior,
> +               unsigned long start, unsigned long end)
> +{
> +       struct vm_area_struct *vma;
> +
> +       if (!madv_behavior || madv_behavior->lock_mode != MADVISE_VMA_READ_LOCK)

nit: I think it would be better to do this check before calling
try_vma_read_lock(). IMHO it does not make sense to call
try_vma_read_lock() when lock_mode != MADVISE_VMA_READ_LOCK. It also
makes reading this function easier. The first time I looked at it,
seeing "return NULL" in one place that takes mmap_read_lock() and in
another place that returns the same NULL without taking the mmap lock
really confused me.
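
(For illustration, the call site in madvise_walk_vmas() would then look
something like the following, reusing names from the patch above; just a
sketch of the nit, not a requirement:)

	/* If VMA read lock supported, we apply advice to a single VMA only. */
	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
		vma = try_vma_read_lock(mm, madv_behavior, start, end);
		if (vma) {
			error = visit(vma, &prev, start, end, arg);
			vma_end_read(vma);
			return error;
		}
	}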

> +               return NULL;
> +
> +       vma = lock_vma_under_rcu(mm, start);
> +       if (!vma)
> +               goto take_mmap_read_lock;
> +       /* We must span only a single VMA, uffd unsupported. */
> +       if (end > vma->vm_end || userfaultfd_armed(vma)) {

vma->vm_end is not inclusive, so the above condition I think should be
(end >= vma->vm_end || ...)

> +               vma_end_read(vma);
> +               goto take_mmap_read_lock;
> +       }
> +       return vma;
> +
> +take_mmap_read_lock:
> +       mmap_read_lock(mm);
> +       madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
> +       return NULL;
> +}
> +
>  /*
>   * Walk the vmas in range [start,end), and call the visit function on each one.
>   * The visit function will get start and end parameters that cover the overlap
> @@ -1496,7 +1514,8 @@ static bool process_madvise_remote_valid(int behavior)
>   */
>  static
>  int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> -                     unsigned long end, void *arg,
> +                     unsigned long end, struct madvise_behavior *madv_behavior,
> +                     void *arg,
>                       int (*visit)(struct vm_area_struct *vma,
>                                    struct vm_area_struct **prev, unsigned long start,
>                                    unsigned long end, void *arg))
> @@ -1505,6 +1524,15 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
>         struct vm_area_struct *prev;
>         unsigned long tmp;
>         int unmapped_error = 0;
> +       int error;
> +
> +       /* If VMA read lock supported, we apply advice to a single VMA only. */
> +       vma = try_vma_read_lock(mm, madv_behavior, start, end);
> +       if (vma) {
> +               error = visit(vma, &prev, start, end, arg);
> +               vma_end_read(vma);
> +               return error;
> +       }
>
>         /*
>          * If the interval [start,end) covers some unmapped address
> @@ -1516,8 +1544,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
>                 prev = vma;
>
>         for (;;) {
> -               int error;
> -
>                 /* Still start < end. */
>                 if (!vma)
>                         return -ENOMEM;
> @@ -1598,34 +1624,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
>         if (end == start)
>                 return 0;
>
> -       return madvise_walk_vmas(mm, start, end, anon_name,
> +       return madvise_walk_vmas(mm, start, end, anon_name, NULL,
>                                  madvise_vma_anon_name);
>  }
>  #endif /* CONFIG_ANON_VMA_NAME */
>
> -static int madvise_lock(struct mm_struct *mm, int behavior)
> +
> +/*
> + * Any behaviour which results in changes to the vma->vm_flags needs to
> + * take mmap_lock for writing. Others, which simply traverse vmas, need
> + * to only take it for reading.
> + */
> +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
>  {
> +       int behavior = madv_behavior->behavior;
> +
>         if (is_memory_failure(behavior))
> -               return 0;
> +               return MADVISE_NO_LOCK;
>
> -       if (madvise_need_mmap_write(behavior)) {
> +       switch (behavior) {
> +       case MADV_REMOVE:
> +       case MADV_WILLNEED:
> +       case MADV_COLD:
> +       case MADV_PAGEOUT:
> +       case MADV_FREE:
> +       case MADV_POPULATE_READ:
> +       case MADV_POPULATE_WRITE:
> +       case MADV_COLLAPSE:
> +       case MADV_GUARD_INSTALL:
> +       case MADV_GUARD_REMOVE:
> +               return MADVISE_MMAP_READ_LOCK;
> +       case MADV_DONTNEED:
> +       case MADV_DONTNEED_LOCKED:
> +               return MADVISE_VMA_READ_LOCK;
> +       default:
> +               return MADVISE_MMAP_WRITE_LOCK;
> +       }
> +}
> +
> +static int madvise_lock(struct mm_struct *mm,
> +               struct madvise_behavior *madv_behavior)
> +{
> +       enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
> +
> +       switch (lock_mode) {
> +       case MADVISE_NO_LOCK:
> +               break;
> +       case MADVISE_MMAP_WRITE_LOCK:
>                 if (mmap_write_lock_killable(mm))
>                         return -EINTR;
> -       } else {
> +               break;
> +       case MADVISE_MMAP_READ_LOCK:
>                 mmap_read_lock(mm);
> +               break;
> +       case MADVISE_VMA_READ_LOCK:
> +               /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
> +               break;
>         }
> +
> +       madv_behavior->lock_mode = lock_mode;
>         return 0;
>  }
>
> -static void madvise_unlock(struct mm_struct *mm, int behavior)
> +static void madvise_unlock(struct mm_struct *mm,
> +               struct madvise_behavior *madv_behavior)
>  {
> -       if (is_memory_failure(behavior))
> +       switch (madv_behavior->lock_mode) {
> +       case  MADVISE_NO_LOCK:
>                 return;
> -
> -       if (madvise_need_mmap_write(behavior))
> +       case MADVISE_MMAP_WRITE_LOCK:
>                 mmap_write_unlock(mm);
> -       else
> +               break;
> +       case MADVISE_MMAP_READ_LOCK:
>                 mmap_read_unlock(mm);
> +               break;
> +       case MADVISE_VMA_READ_LOCK:
> +               /* We will drop the lock per-VMA in madvise_walk_vmas(). */
> +               break;
> +       }
> +
> +       madv_behavior->lock_mode = MADVISE_NO_LOCK;
>  }
>
>  static bool madvise_batch_tlb_flush(int behavior)
> @@ -1721,6 +1799,8 @@ static int madvise_do_behavior(struct mm_struct *mm,
>
>         if (is_memory_failure(behavior))
>                 return madvise_inject_error(behavior, start, start + len_in);
> +
> +       // TODO: handle untagged stuff here...
>         start = untagged_addr(start); //untagged_addr_remote(mm, start);
>         end = start + PAGE_ALIGN(len_in);
>
> @@ -1729,7 +1809,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
>                 error = madvise_populate(mm, start, end, behavior);
>         else
>                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> -                                         madvise_vma_behavior);
> +                                         madv_behavior, madvise_vma_behavior);
>         blk_finish_plug(&plug);
>         return error;
>  }
> @@ -1817,13 +1897,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
>
>         if (madvise_should_skip(start, len_in, behavior, &error))
>                 return error;
> -       error = madvise_lock(mm, behavior);
> +       error = madvise_lock(mm, &madv_behavior);
>         if (error)
>                 return error;
>         madvise_init_tlb(&madv_behavior, mm);
>         error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
>         madvise_finish_tlb(&madv_behavior);
> -       madvise_unlock(mm, behavior);
> +       madvise_unlock(mm, &madv_behavior);
>
>         return error;
>  }
> @@ -1847,7 +1927,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
>
>         total_len = iov_iter_count(iter);
>
> -       ret = madvise_lock(mm, behavior);
> +       ret = madvise_lock(mm, &madv_behavior);
>         if (ret)
>                 return ret;
>         madvise_init_tlb(&madv_behavior, mm);
> @@ -1880,8 +1960,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
>
>                         /* Drop and reacquire lock to unwind race. */
>                         madvise_finish_tlb(&madv_behavior);
> -                       madvise_unlock(mm, behavior);
> -                       ret = madvise_lock(mm, behavior);
> +                       madvise_unlock(mm, &madv_behavior);
> +                       ret = madvise_lock(mm, &madv_behavior);
>                         if (ret)
>                                 goto out;
>                         madvise_init_tlb(&madv_behavior, mm);
> @@ -1892,7 +1972,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
>                 iov_iter_advance(iter, iter_iov_len(iter));
>         }
>         madvise_finish_tlb(&madv_behavior);
> -       madvise_unlock(mm, behavior);
> +       madvise_unlock(mm, &madv_behavior);
>
>  out:
>         ret = (total_len - iov_iter_count(iter)) ? : ret;
> --
> 2.49.0


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 18:43 ` Lorenzo Stoakes
  2025-06-03 20:17   ` Suren Baghdasaryan
@ 2025-06-03 20:59   ` Pedro Falcato
  2025-06-04  5:23     ` Lorenzo Stoakes
  1 sibling, 1 reply; 35+ messages in thread
From: Pedro Falcato @ 2025-06-03 20:59 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 03, 2025 at 07:43:04PM +0100, Lorenzo Stoakes wrote:
> Hi Barry,
> 
> As promised, I enclose a patch to give a sense of how I think we might
> thread state through this operation.
> 
> There's a todo on the untagged stuff so you can figure that out. This is
> based on the v1 so it might not encompass everything you addressed in the
> v2.
> 
> Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks, I
> _despise_ the void *arg function ptr stuff there added just for the anon
> vma name stuff (ughhh) so might be the only sensible way of threading
> state.
> 
> I don't need any attribution, so please use this patch as you see
> fit/adapt/delete/do whatever with it, just an easier way for me to show the
> idea!
> 
> I did some very basic testing and it seems to work, but nothing deeper.
> 
> Cheers, Lorenzo
> 
> ----8<----
> >From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Date: Tue, 3 Jun 2025 18:22:55 +0100
> Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
> 
> Refactor the madvise() code to retain state about the locking mode utilised
> for traversing VMAs.
> 
> Then use this mechanism to permit VMA locking to be done later in the
> madvise() logic and also to allow altering of the locking mode to permit
> falling back to an mmap read lock if required.
> 

Just as a quick drive-by comment: I was playing around with using per-vma locks
for GUP and mm_populate a few weeks ago. I never actually finished the work (and I
still plan on getting around to doing it Eventually(tm)), but my final concept of an
approach was to simply read-lock every VMA in a range (if that fails, go back
to the mmap_lock).

I *think* it works, and doesn't have the same limitation for single VMAs.

I understand this is a super handwavy suggestion, but I know this discussion has
been happening and I just wanted to get this idea out of obscure IRC logs :)
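For the record, a very rough sketch of what I have in mind (the helper name and
structure are invented for illustration; it assumes lock_vma_under_rcu() /
vma_end_read() per VMA, with anything unexpected falling back to the mmap read
lock):

	/* Read-lock every VMA overlapping [start, end), or fall back. */
	static int lock_vma_range_read(struct mm_struct *mm, unsigned long start,
			unsigned long end, struct vm_area_struct **vmas, int max)
	{
		unsigned long addr = start;
		int n = 0;

		while (addr < end) {
			struct vm_area_struct *vma = lock_vma_under_rcu(mm, addr);

			/* Hole, contention or too many VMAs: give the locks back. */
			if (!vma || n == max)
				goto fallback;
			vmas[n++] = vma;
			addr = vma->vm_end;
		}
		return n;

	fallback:
		while (n--)
			vma_end_read(vmas[n]);
		mmap_read_lock(mm);
		return -1;	/* caller proceeds under the mmap read lock */
	}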

-- 
Pedro


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 20:17   ` Suren Baghdasaryan
@ 2025-06-04  5:22     ` Lorenzo Stoakes
  2025-06-06  7:18     ` Barry Song
  1 sibling, 0 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-04  5:22 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Lokesh Gidra, Tangquan Zheng

These notes are for Barry to adapt according to his needs for this series,
but FWIW agreed on both counts! :)

On Tue, Jun 03, 2025 at 01:17:10PM -0700, Suren Baghdasaryan wrote:
> On Tue, Jun 3, 2025 at 11:43 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > Hi Barry,
> >
> > As promised, I enclose a patch to give a sense of how I think we might
> > thread state through this operation.
> >
> > There's a todo on the untagged stuff so you can figure that out. This is
> > based on the v1 so it might not encompass everything you addressed in the
> > v2.
> >
> > Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks, I
> > _despise_ the void *arg function ptr stuff there added just for the anon
> > vma name stuff (ughhh) so might be the only sensible way of threading
> > state.
> >
> > I don't need any attribution, so please use this patch as you see
> > fit/adapt/delete/do whatever with it, just an easier way for me to show the
> > idea!
> >
> > I did some very basic testing and it seems to work, but nothing deeper.
> >
> > Cheers, Lorenzo
> >
> > ----8<----
> > From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Date: Tue, 3 Jun 2025 18:22:55 +0100
> > Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
> >
> > Refactor the madvise() code to retain state about the locking mode utilised
> > for traversing VMAs.
> >
> > Then use this mechanism to permit VMA locking to be done later in the
> > madvise() logic and also to allow altering of the locking mode to permit
> > falling back to an mmap read lock if required.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/madvise.c | 174 +++++++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 127 insertions(+), 47 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 5f7a66a1617e..a3a6d73d0bd5 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -48,38 +48,19 @@ struct madvise_walk_private {
> >         bool pageout;
> >  };
> >
> > +enum madvise_lock_mode {
> > +       MADVISE_NO_LOCK,
> > +       MADVISE_MMAP_READ_LOCK,
> > +       MADVISE_MMAP_WRITE_LOCK,
> > +       MADVISE_VMA_READ_LOCK,
> > +};
> > +
> >  struct madvise_behavior {
> >         int behavior;
> >         struct mmu_gather *tlb;
> > +       enum madvise_lock_mode lock_mode;
> >  };
> >
> > -/*
> > - * Any behaviour which results in changes to the vma->vm_flags needs to
> > - * take mmap_lock for writing. Others, which simply traverse vmas, need
> > - * to only take it for reading.
> > - */
> > -static int madvise_need_mmap_write(int behavior)
> > -{
> > -       switch (behavior) {
> > -       case MADV_REMOVE:
> > -       case MADV_WILLNEED:
> > -       case MADV_DONTNEED:
> > -       case MADV_DONTNEED_LOCKED:
> > -       case MADV_COLD:
> > -       case MADV_PAGEOUT:
> > -       case MADV_FREE:
> > -       case MADV_POPULATE_READ:
> > -       case MADV_POPULATE_WRITE:
> > -       case MADV_COLLAPSE:
> > -       case MADV_GUARD_INSTALL:
> > -       case MADV_GUARD_REMOVE:
> > -               return 0;
> > -       default:
> > -               /* be safe, default to 1. list exceptions explicitly */
> > -               return 1;
> > -       }
> > -}
> > -
> >  #ifdef CONFIG_ANON_VMA_NAME
> >  struct anon_vma_name *anon_vma_name_alloc(const char *name)
> >  {
> > @@ -1486,6 +1467,43 @@ static bool process_madvise_remote_valid(int behavior)
> >         }
> >  }
> >
> > +/*
> > + * Try to acquire a VMA read lock if possible.
> > + *
> > + * We only support this lock over a single VMA, which the input range must
> > + * span, either partially or fully.
> > + *
> > + * This function always returns with an appropriate lock held. If a VMA read
> > + * lock could be acquired, we return the locked VMA.
> > + *
> > + * If a VMA read lock could not be acquired, we return NULL and expect caller to
>
> Worth mentioning that the function itself will fall back to taking
> mmap_read_lock in such a case.
>
> > + * fallback to mmap lock behaviour.
> > + */
> > +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior,
> > +               unsigned long start, unsigned long end)
> > +{
> > +       struct vm_area_struct *vma;
> > +
> > +       if (!madv_behavior || madv_behavior->lock_mode != MADVISE_VMA_READ_LOCK)
>
> nit: I think it would be better to do this check before calling
> try_vma_read_lock(). IMHO it does not make sense to call
> try_vma_read_lock() when lock_mode != MADVISE_VMA_READ_LOCK. It also
> makes reading this function easier. The first time I looked at it and
> saw "return NULL" in one place that takes mmap_read_lock() and another
> place which returns the same NULL but does not take mmap_lock really
> confused me.
>
> > +               return NULL;
> > +
> > +       vma = lock_vma_under_rcu(mm, start);
> > +       if (!vma)
> > +               goto take_mmap_read_lock;
> > +       /* We must span only a single VMA, uffd unsupported. */
> > +       if (end > vma->vm_end || userfaultfd_armed(vma)) {
>
> vma->vm_end is not inclusive, so the above condition I think should be
> (end >= vma->vm_end || ...)
>
> > +               vma_end_read(vma);
> > +               goto take_mmap_read_lock;
> > +       }
> > +       return vma;
> > +
> > +take_mmap_read_lock:
> > +       mmap_read_lock(mm);
> > +       madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
> > +       return NULL;
> > +}
> > +
> >  /*
> >   * Walk the vmas in range [start,end), and call the visit function on each one.
> >   * The visit function will get start and end parameters that cover the overlap
> > @@ -1496,7 +1514,8 @@ static bool process_madvise_remote_valid(int behavior)
> >   */
> >  static
> >  int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> > -                     unsigned long end, void *arg,
> > +                     unsigned long end, struct madvise_behavior *madv_behavior,
> > +                     void *arg,
> >                       int (*visit)(struct vm_area_struct *vma,
> >                                    struct vm_area_struct **prev, unsigned long start,
> >                                    unsigned long end, void *arg))
> > @@ -1505,6 +1524,15 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> >         struct vm_area_struct *prev;
> >         unsigned long tmp;
> >         int unmapped_error = 0;
> > +       int error;
> > +
> > +       /* If VMA read lock supported, we apply advice to a single VMA only. */
> > +       vma = try_vma_read_lock(mm, madv_behavior, start, end);
> > +       if (vma) {
> > +               error = visit(vma, &prev, start, end, arg);
> > +               vma_end_read(vma);
> > +               return error;
> > +       }
> >
> >         /*
> >          * If the interval [start,end) covers some unmapped address
> > @@ -1516,8 +1544,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> >                 prev = vma;
> >
> >         for (;;) {
> > -               int error;
> > -
> >                 /* Still start < end. */
> >                 if (!vma)
> >                         return -ENOMEM;
> > @@ -1598,34 +1624,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> >         if (end == start)
> >                 return 0;
> >
> > -       return madvise_walk_vmas(mm, start, end, anon_name,
> > +       return madvise_walk_vmas(mm, start, end, anon_name, NULL,
> >                                  madvise_vma_anon_name);
> >  }
> >  #endif /* CONFIG_ANON_VMA_NAME */
> >
> > -static int madvise_lock(struct mm_struct *mm, int behavior)
> > +
> > +/*
> > + * Any behaviour which results in changes to the vma->vm_flags needs to
> > + * take mmap_lock for writing. Others, which simply traverse vmas, need
> > + * to only take it for reading.
> > + */
> > +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
> >  {
> > +       int behavior = madv_behavior->behavior;
> > +
> >         if (is_memory_failure(behavior))
> > -               return 0;
> > +               return MADVISE_NO_LOCK;
> >
> > -       if (madvise_need_mmap_write(behavior)) {
> > +       switch (behavior) {
> > +       case MADV_REMOVE:
> > +       case MADV_WILLNEED:
> > +       case MADV_COLD:
> > +       case MADV_PAGEOUT:
> > +       case MADV_FREE:
> > +       case MADV_POPULATE_READ:
> > +       case MADV_POPULATE_WRITE:
> > +       case MADV_COLLAPSE:
> > +       case MADV_GUARD_INSTALL:
> > +       case MADV_GUARD_REMOVE:
> > +               return MADVISE_MMAP_READ_LOCK;
> > +       case MADV_DONTNEED:
> > +       case MADV_DONTNEED_LOCKED:
> > +               return MADVISE_VMA_READ_LOCK;
> > +       default:
> > +               return MADVISE_MMAP_WRITE_LOCK;
> > +       }
> > +}
> > +
> > +static int madvise_lock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> > +{
> > +       enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
> > +
> > +       switch (lock_mode) {
> > +       case MADVISE_NO_LOCK:
> > +               break;
> > +       case MADVISE_MMAP_WRITE_LOCK:
> >                 if (mmap_write_lock_killable(mm))
> >                         return -EINTR;
> > -       } else {
> > +               break;
> > +       case MADVISE_MMAP_READ_LOCK:
> >                 mmap_read_lock(mm);
> > +               break;
> > +       case MADVISE_VMA_READ_LOCK:
> > +               /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
> > +               break;
> >         }
> > +
> > +       madv_behavior->lock_mode = lock_mode;
> >         return 0;
> >  }
> >
> > -static void madvise_unlock(struct mm_struct *mm, int behavior)
> > +static void madvise_unlock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> >  {
> > -       if (is_memory_failure(behavior))
> > +       switch (madv_behavior->lock_mode) {
> > +       case  MADVISE_NO_LOCK:
> >                 return;
> > -
> > -       if (madvise_need_mmap_write(behavior))
> > +       case MADVISE_MMAP_WRITE_LOCK:
> >                 mmap_write_unlock(mm);
> > -       else
> > +               break;
> > +       case MADVISE_MMAP_READ_LOCK:
> >                 mmap_read_unlock(mm);
> > +               break;
> > +       case MADVISE_VMA_READ_LOCK:
> > +               /* We will drop the lock per-VMA in madvise_walk_vmas(). */
> > +               break;
> > +       }
> > +
> > +       madv_behavior->lock_mode = MADVISE_NO_LOCK;
> >  }
> >
> >  static bool madvise_batch_tlb_flush(int behavior)
> > @@ -1721,6 +1799,8 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >
> >         if (is_memory_failure(behavior))
> >                 return madvise_inject_error(behavior, start, start + len_in);
> > +
> > +       // TODO: handle untagged stuff here...
> >         start = untagged_addr(start); //untagged_addr_remote(mm, start);
> >         end = start + PAGE_ALIGN(len_in);
> >
> > @@ -1729,7 +1809,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >                 error = madvise_populate(mm, start, end, behavior);
> >         else
> >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> > -                                         madvise_vma_behavior);
> > +                                         madv_behavior, madvise_vma_behavior);
> >         blk_finish_plug(&plug);
> >         return error;
> >  }
> > @@ -1817,13 +1897,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
> >
> >         if (madvise_should_skip(start, len_in, behavior, &error))
> >                 return error;
> > -       error = madvise_lock(mm, behavior);
> > +       error = madvise_lock(mm, &madv_behavior);
> >         if (error)
> >                 return error;
> >         madvise_init_tlb(&madv_behavior, mm);
> >         error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
> >         madvise_finish_tlb(&madv_behavior);
> > -       madvise_unlock(mm, behavior);
> > +       madvise_unlock(mm, &madv_behavior);
> >
> >         return error;
> >  }
> > @@ -1847,7 +1927,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >         total_len = iov_iter_count(iter);
> >
> > -       ret = madvise_lock(mm, behavior);
> > +       ret = madvise_lock(mm, &madv_behavior);
> >         if (ret)
> >                 return ret;
> >         madvise_init_tlb(&madv_behavior, mm);
> > @@ -1880,8 +1960,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >                         /* Drop and reacquire lock to unwind race. */
> >                         madvise_finish_tlb(&madv_behavior);
> > -                       madvise_unlock(mm, behavior);
> > -                       ret = madvise_lock(mm, behavior);
> > +                       madvise_unlock(mm, &madv_behavior);
> > +                       ret = madvise_lock(mm, &madv_behavior);
> >                         if (ret)
> >                                 goto out;
> >                         madvise_init_tlb(&madv_behavior, mm);
> > @@ -1892,7 +1972,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >                 iov_iter_advance(iter, iter_iov_len(iter));
> >         }
> >         madvise_finish_tlb(&madv_behavior);
> > -       madvise_unlock(mm, behavior);
> > +       madvise_unlock(mm, &madv_behavior);
> >
> >  out:
> >         ret = (total_len - iov_iter_count(iter)) ? : ret;
> > --
> > 2.49.0


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 20:59   ` Pedro Falcato
@ 2025-06-04  5:23     ` Lorenzo Stoakes
  0 siblings, 0 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-04  5:23 UTC (permalink / raw)
  To: Pedro Falcato
  Cc: Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Tue, Jun 03, 2025 at 09:59:46PM +0100, Pedro Falcato wrote:
> On Tue, Jun 03, 2025 at 07:43:04PM +0100, Lorenzo Stoakes wrote:
> > Hi Barry,
> >
> > As promised, I enclose a patch to give a sense of how I think we might
> > thread state through this operation.
> >
> > There's a todo on the untagged stuff so you can figure that out. This is
> > based on the v1 so it might not encompass everything you addressed in the
> > v2.
> >
> > Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks, I
> > _despise_ the void *arg function ptr stuff there added just for the anon
> > vma name stuff (ughhh) so might be the only sensible way of threading
> > state.
> >
> > I don't need any attribution, so please use this patch as you see
> > fit/adapt/delete/do whatever with it, just an easier way for me to show the
> > idea!
> >
> > I did some very basic testing and it seems to work, but nothing deeper.
> >
> > Cheers, Lorenzo
> >
> > ----8<----
> > >From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Date: Tue, 3 Jun 2025 18:22:55 +0100
> > Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
> >
> > Refactor the madvise() code to retain state about the locking mode utilised
> > for traversing VMAs.
> >
> > Then use this mechanism to permit VMA locking to be done later in the
> > madvise() logic and also to allow altering of the locking mode to permit
> > falling back to an mmap read lock if required.
> >
>
> Just as a quick drive-by comment: I was playing around with using per-vma locks
> for GUP and mm_populate a few weeks ago. I never actually finished the work (and I
> still plan on getting around to doing it Eventually(tm)), but my final concept of an
> approach was to simply read-lock every VMA in a range (if that fails, go back
> to the mmap_lock).
>
> I *think* it works, and doesn't have the same limitation for single VMAs.
>
> I understand this is a super handwavy suggestion, but I know this discussion has
> been happening and I just wanted to get this idea out of obscure IRC logs :)

Ack, and I was considering something like that, but it's one thing at a time :)

Doing things in a more generic/integrated way like this should make it
easier to incrementally change things.

>
> --
> Pedro


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03  9:54     ` Lorenzo Stoakes
@ 2025-06-04  6:02       ` Qi Zheng
  2025-06-04 17:50         ` Lorenzo Stoakes
  0 siblings, 1 reply; 35+ messages in thread
From: Qi Zheng @ 2025-06-04  6:02 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Hi Lorenzo,

On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
> On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
>> Hi Jann,
>>
>> On 5/30/25 10:06 PM, Jann Horn wrote:
>>> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
>>>> Certain madvise operations, especially MADV_DONTNEED, occur far more
>>>> frequently than other madvise options, particularly in native and Java
>>>> heaps for dynamic memory management.
>>>>
>>>> Currently, the mmap_lock is always held during these operations, even when
>>>> unnecessary. This causes lock contention and can lead to severe priority
>>>> inversion, where low-priority threads—such as Android's HeapTaskDaemon—
>>>> hold the lock and block higher-priority threads.
>>>>
>>>> This patch enables the use of per-VMA locks when the advised range lies
>>>> entirely within a single VMA, avoiding the need for full VMA traversal. In
>>>> practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
>>>>
>>>> Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
>>>> benefits from this per-VMA lock optimization. After extended runtime,
>>>> 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
>>>> only 1,231 fell back to mmap_lock.
>>>>
>>>> To simplify handling, the implementation falls back to the standard
>>>> mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
>>>> userfaultfd_remove().
>>>
>>> One important quirk of this is that it can, from what I can see, cause
>>> freeing of page tables (through pt_reclaim) without holding the mmap
>>> lock at all:
>>>
>>> do_madvise [behavior=MADV_DONTNEED]
>>>     madvise_lock
>>>       lock_vma_under_rcu
>>>     madvise_do_behavior
>>>       madvise_single_locked_vma
>>>         madvise_vma_behavior
>>>           madvise_dontneed_free
>>>             madvise_dontneed_single_vma
>>>               zap_page_range_single_batched [.reclaim_pt = true]
>>>                 unmap_single_vma
>>>                   unmap_page_range
>>>                     zap_p4d_range
>>>                       zap_pud_range
>>>                         zap_pmd_range
>>>                           zap_pte_range
>>>                             try_get_and_clear_pmd
>>>                             free_pte
>>>
>>> This clashes with the assumption in walk_page_range_novma() that
>>> holding the mmap lock in write mode is sufficient to prevent
>>> concurrent page table freeing, so it can probably lead to page table
>>> UAF through the ptdump interface (see ptdump_walk_pgd()).
>>
>> Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
>> following case:
>>
>> cpu 0				cpu 1
>>
>> ptdump_walk_pgd
>> --> walk_pte_range
>>      --> pte_offset_map (hold RCU read lock)
>> 				zap_pte_range
>> 				--> free_pte (via RCU)
>>          walk_pte_range_inner
>>          --> ptdump_pte_entry (the PTE page is not freed at this time)
>>
>> IIUC, there is no UAF issue here?
>>
>> If I missed anything please let me know.
>>
>> Thanks,
>> Qi
>>
>>
> 
> I forgot about that interesting placement of RCU lock acquisition :) I will
> obviously let Jann come back to you on this, but I wonder if I need to
> update the doc to reflect this actually.

I saw that there is already a relevant description in process_addrs.rst:


```
So accessing PTE-level page tables requires at least holding an RCU read lock;
   but that only suffices for readers that can tolerate racing with concurrent
   page table updates such that an empty PTE is observed (in a page table that
   has actually already been detached and marked for RCU freeing) while another
   new page table has been installed in the same location and filled with
   entries. Writers normally need to take the PTE lock and revalidate that the
   PMD entry still refers to the same PTE-level page table.
   If the writer does not care whether it is the same PTE-level page table, it
   can take the PMD lock and revalidate that the contents of pmd entry still meet
   the requirements. In particular, this also happens in :c:func:`!retract_page_tables`
   when handling :c:macro:`!MADV_COLLAPSE`.
```
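In code, the writer-side rule above is essentially what pte_offset_map_lock()
already provides (a minimal sketch of the usual pattern):

	pte_t *pte;
	spinlock_t *ptl;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return;	/* no PTE-level page table here (any more) */
	/* The pmd entry has been revalidated under the PTE lock; safe to modify entries. */
	pte_unmap_unlock(pte, ptl);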

Thanks!




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-04  6:02       ` Qi Zheng
@ 2025-06-04 17:50         ` Lorenzo Stoakes
  2025-06-05  3:23           ` Qi Zheng
  2025-06-06 11:07           ` Jann Horn
  0 siblings, 2 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-04 17:50 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
> Hi Lorenzo,
>
> On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
> > On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
> > > Hi Jann,
> > >
> > > On 5/30/25 10:06 PM, Jann Horn wrote:
> > > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > > > > frequently than other madvise options, particularly in native and Java
> > > > > heaps for dynamic memory management.
> > > > >
> > > > > Currently, the mmap_lock is always held during these operations, even when
> > > > > unnecessary. This causes lock contention and can lead to severe priority
> > > > > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > > > > hold the lock and block higher-priority threads.
> > > > >
> > > > > This patch enables the use of per-VMA locks when the advised range lies
> > > > > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > > > > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> > > > >
> > > > > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > > > > benefits from this per-VMA lock optimization. After extended runtime,
> > > > > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > > > > only 1,231 fell back to mmap_lock.
> > > > >
> > > > > To simplify handling, the implementation falls back to the standard
> > > > > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > > > > userfaultfd_remove().
> > > >
> > > > One important quirk of this is that it can, from what I can see, cause
> > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > lock at all:
> > > >
> > > > do_madvise [behavior=MADV_DONTNEED]
> > > >     madvise_lock
> > > >       lock_vma_under_rcu
> > > >     madvise_do_behavior
> > > >       madvise_single_locked_vma
> > > >         madvise_vma_behavior
> > > >           madvise_dontneed_free
> > > >             madvise_dontneed_single_vma
> > > >               zap_page_range_single_batched [.reclaim_pt = true]
> > > >                 unmap_single_vma
> > > >                   unmap_page_range
> > > >                     zap_p4d_range
> > > >                       zap_pud_range
> > > >                         zap_pmd_range
> > > >                           zap_pte_range
> > > >                             try_get_and_clear_pmd
> > > >                             free_pte
> > > >
> > > > This clashes with the assumption in walk_page_range_novma() that
> > > > holding the mmap lock in write mode is sufficient to prevent
> > > > concurrent page table freeing, so it can probably lead to page table
> > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > >
> > > Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
> > > following case:
> > >
> > > cpu 0				cpu 1
> > >
> > > ptdump_walk_pgd
> > > --> walk_pte_range
> > >      --> pte_offset_map (hold RCU read lock)
> > > 				zap_pte_range
> > > 				--> free_pte (via RCU)
> > >          walk_pte_range_inner
> > >          --> ptdump_pte_entry (the PTE page is not freed at this time)
> > >
> > > IIUC, there is no UAF issue here?
> > >
> > > If I missed anything please let me know.

Seems to me that we don't need the VMA locks then unless I'm missing
something? :) Jann?

Would this RCU-lock-acquired-by-pte_offset_map also save us from the
munmap() downgraded read lock scenario? Or is the problem there the
intermediate page table teardown, I guess?

> > >
> > > Thanks,
> > > Qi
> > >
> > >
> >
> > I forgot about that interesting placement of RCU lock acquisition :) I will
> > obviously let Jann come back to you on this, but I wonder if I need to
> > update the doc to reflect this actually.
>
> I saw that there is already a relevant description in process_addrs.rst:
>
>
> ```
> So accessing PTE-level page tables requires at least holding an RCU read lock;
>   but that only suffices for readers that can tolerate racing with concurrent
>   page table updates such that an empty PTE is observed (in a page table that
>   has actually already been detached and marked for RCU freeing) while another
>   new page table has been installed in the same location and filled with
>   entries. Writers normally need to take the PTE lock and revalidate that the
>   PMD entry still refers to the same PTE-level page table.
>   If the writer does not care whether it is the same PTE-level page table, it
>   can take the PMD lock and revalidate that the contents of pmd entry still meet
>   the requirements. In particular, this also happens in :c:func:`!retract_page_tables`
>   when handling :c:macro:`!MADV_COLLAPSE`.
> ```
>
> Thanks!
>
>

Thanks I think you're right!


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-04 17:50         ` Lorenzo Stoakes
@ 2025-06-05  3:23           ` Qi Zheng
  2025-06-05 14:04             ` Lorenzo Stoakes
  2025-06-06 11:07           ` Jann Horn
  1 sibling, 1 reply; 35+ messages in thread
From: Qi Zheng @ 2025-06-05  3:23 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng



On 6/5/25 1:50 AM, Lorenzo Stoakes wrote:
> On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
>> Hi Lorenzo,
>>
>> On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
>>> On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
>>>> Hi Jann,
>>>>
>>>> On 5/30/25 10:06 PM, Jann Horn wrote:
>>>>> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>>> Certain madvise operations, especially MADV_DONTNEED, occur far more
>>>>>> frequently than other madvise options, particularly in native and Java
>>>>>> heaps for dynamic memory management.
>>>>>>
>>>>>> Currently, the mmap_lock is always held during these operations, even when
>>>>>> unnecessary. This causes lock contention and can lead to severe priority
>>>>>> inversion, where low-priority threads—such as Android's HeapTaskDaemon—
>>>>>> hold the lock and block higher-priority threads.
>>>>>>
>>>>>> This patch enables the use of per-VMA locks when the advised range lies
>>>>>> entirely within a single VMA, avoiding the need for full VMA traversal. In
>>>>>> practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
>>>>>>
>>>>>> Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
>>>>>> benefits from this per-VMA lock optimization. After extended runtime,
>>>>>> 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
>>>>>> only 1,231 fell back to mmap_lock.
>>>>>>
>>>>>> To simplify handling, the implementation falls back to the standard
>>>>>> mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
>>>>>> userfaultfd_remove().
>>>>>
>>>>> One important quirk of this is that it can, from what I can see, cause
>>>>> freeing of page tables (through pt_reclaim) without holding the mmap
>>>>> lock at all:
>>>>>
>>>>> do_madvise [behavior=MADV_DONTNEED]
>>>>>      madvise_lock
>>>>>        lock_vma_under_rcu
>>>>>      madvise_do_behavior
>>>>>        madvise_single_locked_vma
>>>>>          madvise_vma_behavior
>>>>>            madvise_dontneed_free
>>>>>              madvise_dontneed_single_vma
>>>>>                zap_page_range_single_batched [.reclaim_pt = true]
>>>>>                  unmap_single_vma
>>>>>                    unmap_page_range
>>>>>                      zap_p4d_range
>>>>>                        zap_pud_range
>>>>>                          zap_pmd_range
>>>>>                            zap_pte_range
>>>>>                              try_get_and_clear_pmd
>>>>>                              free_pte
>>>>>
>>>>> This clashes with the assumption in walk_page_range_novma() that
>>>>> holding the mmap lock in write mode is sufficient to prevent
>>>>> concurrent page table freeing, so it can probably lead to page table
>>>>> UAF through the ptdump interface (see ptdump_walk_pgd()).
>>>>
>>>> Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
>>>> following case:
>>>>
>>>> cpu 0				cpu 1
>>>>
>>>> ptdump_walk_pgd
>>>> --> walk_pte_range
>>>>       --> pte_offset_map (hold RCU read lock)
>>>> 				zap_pte_range
>>>> 				--> free_pte (via RCU)
>>>>           walk_pte_range_inner
>>>>           --> ptdump_pte_entry (the PTE page is not freed at this time)
>>>>
>>>> IIUC, there is no UAF issue here?
>>>>
>>>> If I missed anything please let me know.
> 
> Seems to me that we don't need the VMA locks then unless I'm missing
> something? :) Jann?
> 
> Would this RCU-lock-acquired-by-pte_offset_map also save us from the
> munmap() downgraded read lock scenario also? Or is the problem there
> intermediate page table teardown I guess?
> 

Right. Currently, page table pages other than PTE pages are not
protected by RCU, so the mmap write lock is still needed in the munmap path
to wait for all readers of the page table pages to exit the critical
section.

In other words, once we have achieved that all page table pages are
protected by RCU, we can completely remove the page table pages from
the protection of mmap locks.

Here are some of my previous thoughts:

```
Another plan
============

Currently, page table modification are protected by page table locks
(page_table_lock or split pmd/pte lock), but the life cycle of page
table pages are protected by mmap_lock (and vma lock). For more details,
please refer to the latest added Documentation/mm/process_addrs.rst file.

Currently we try to free the PTE pages through RCU when
CONFIG_PT_RECLAIM is turned on. In this case, we will no longer
need to hold mmap_lock for the read/write op on the PTE pages.

So maybe we can remove the page table from the protection of the mmap
lock (which is too big), like this:

1. free all levels of page table pages by RCU, not just PTE pages, but
    also pmd, pud, etc.
2. similar to pte_offset_map/pte_unmap, add
    [pmd|pud]_offset_map/[pmd|pud]_unmap, and make them all contain
    rcu_read_lock/rcu_read_unlock, and make them accept failure.

In this way, we no longer need the mmap lock. For readers, such as page
table walkers, we are already in the critical section of RCU. For
writers, we only need to hold the page table lock.

But there is a difficulty here, that is, the RCU critical section is not
allowed to sleep, but it is possible to sleep in the callback function
of .pmd_entry, such as mmu_notifier_invalidate_range_start().

Use SRCU instead? Or use RCU + refcount method? Not sure. But I think
it's an interesting thing to try.
```
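As a purely illustrative sketch of point 2 above (these helpers do not exist
today; names and semantics are invented here just to show the shape of the
idea, mirroring pte_offset_map()/pte_unmap()):

	/* Hypothetical: access a PMD-level table inside an RCU read section. */
	static inline pmd_t *pmd_offset_map(pud_t *pud, unsigned long addr)
	{
		rcu_read_lock();
		return pmd_offset(pud, addr);	/* caller revalidates the pud entry */
	}

	static inline void pmd_unmap(pmd_t *pmd)
	{
		rcu_read_unlock();
	}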

Thanks!




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 16:52         ` Jann Horn
@ 2025-06-05 10:27           ` Barry Song
  0 siblings, 0 replies; 35+ messages in thread
From: Barry Song @ 2025-06-05 10:27 UTC (permalink / raw)
  To: Jann Horn
  Cc: Lorenzo Stoakes, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Wed, Jun 4, 2025 at 4:53 AM Jann Horn <jannh@google.com> wrote:
>
> On Tue, Jun 3, 2025 at 9:06 AM Barry Song <21cnbao@gmail.com> wrote:
> > On Sat, May 31, 2025 at 8:41 AM Jann Horn <jannh@google.com> wrote:
> > > On Fri, May 30, 2025 at 4:34 PM Lorenzo Stoakes
> > > <lorenzo.stoakes@oracle.com> wrote:
> > > > On Fri, May 30, 2025 at 04:06:30PM +0200, Jann Horn wrote:
> > > > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > > One important quirk of this is that it can, from what I can see, cause
> > > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > > lock at all:
> > > > >
> > > > > do_madvise [behavior=MADV_DONTNEED]
> > > > >   madvise_lock
> > > > >     lock_vma_under_rcu
> > > > >   madvise_do_behavior
> > > > >     madvise_single_locked_vma
> > > > >       madvise_vma_behavior
> > > > >         madvise_dontneed_free
> > > > >           madvise_dontneed_single_vma
> > > > >             zap_page_range_single_batched [.reclaim_pt = true]
> > > > >               unmap_single_vma
> > > > >                 unmap_page_range
> > > > >                   zap_p4d_range
> > > > >                     zap_pud_range
> > > > >                       zap_pmd_range
> > > > >                         zap_pte_range
> > > > >                           try_get_and_clear_pmd
> > > > >                           free_pte
> > > > >
> > > > > This clashes with the assumption in walk_page_range_novma() that
> > > > > holding the mmap lock in write mode is sufficient to prevent
> > > > > concurrent page table freeing, so it can probably lead to page table
> > > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > > >
> > > > Hmmmmmm is this because of the series that allows page table freeing on
> > > > zap... I think Zi's?
> > >
> > > Yeah, that was Qi Zheng's
> > > https://lore.kernel.org/all/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com/
> > > .
> > >
> > > > We need to update the documentation on this then... which currently states
> > > > the VMA need only be stable.
> > > >
> > > > I guess this is still the case except for the novma walker you mention.
> > > >
> > > > Relatedly, It's worth looking at Dev's series which introduces a concerning
> > > > new 'no lock at all' mode to the page table walker explicitly for novma. I
> > > > cc'd you :) See [0].
> > > >
> > > > [0]: https://lore.kernel.org/linux-mm/6a60c052-9935-489e-a38e-1b03a1a79155@lucifer.local/
> > >
> > > Yeah, I saw that you CC'ed me; at a first glance that seems relatively
> > > innocuous to me as long as it's only done for kernel mappings where
> > > all the rules are different.
> > >
> > > >
> > > > >
> > > > > I think before this patch can land, you'll have to introduce some new
> > > > > helper like:
> > > > >
> > > > > void mmap_write_lock_with_all_vmas(struct mm_struct *mm)
> > > > > {
> > > > >   mmap_write_lock(mm);
> > > > >   for_each_vma(vmi, vma)
> > > > >     vma_start_write(vma);
> > > > > }
> > > > >
> > > > > and use that in walk_page_range_novma() for user virtual address space
> > > > > walks, and update the comment in there.
> > > >
> > > > What dude? No, what? Marking literally all VMAs write locked? :/
> > > >
> > > > I think this could have unexpected impact no? We're basically disabling VMA
> > > > locking when we're in novma, that seems... really silly?
> > >
> > > I mean, walk_page_range_novma() being used on user virtual address
> > > space is pretty much a debug-only thing, I don't think it matters if
> > > it has to spend time poking flags in a few thousand VMAs. I guess the
> > > alternative would be to say "ptdump just doesn't show entries between
> > > VMAs, which shouldn't exist in the first place", and change ptdump to
> > > do a normal walk that skips over userspace areas not covered by a VMA.
> > > Maybe that's cleaner.
> > >
> > > But FWIW, we already do worse than what I proposed here when
> > > installing MMU notifiers, with mm_take_all_locks().
> > >
> > > > > > +       else
> > > > > > +               __madvise_unlock(mm, madv_behavior->behavior);
> > > > > > +}
> > > > > > +
> > > > > >  static bool madvise_batch_tlb_flush(int behavior)
> > > > > >  {
> > > > > >         switch (behavior) {
> > > > > > @@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > > > > >                 unsigned long start, size_t len_in,
> > > > > >                 struct madvise_behavior *madv_behavior)
> > > > > >  {
> > > > > > +       struct vm_area_struct *vma = madv_behavior->vma;
> > > > > >         int behavior = madv_behavior->behavior;
> > > > > > +
> > > > > >         struct blk_plug plug;
> > > > > >         unsigned long end;
> > > > > >         int error;
> > > > > >
> > > > > >         if (is_memory_failure(behavior))
> > > > > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > > > > -       start = untagged_addr_remote(mm, start);
> > > > > > +       start = untagged_addr(start);
> > > > >
> > > > > Why is this okay? I see that X86's untagged_addr_remote() asserts that
> > > > > the mmap lock is held, which is no longer the case here with your
> > > > > patch, but untagged_addr() seems wrong here, since we can be operating
> > > > > on another process. I think especially on X86 with 5-level paging and
> > > > > LAM, there can probably be cases where address bits are used for part
> > > > > of the virtual address in one task while they need to be masked off in
> > > > > another task?
> > > > >
> > > > > I wonder if you'll have to refactor X86 and Risc-V first to make this
> > > > > work... ideally by making sure that their address tagging state
> > > > > updates are atomic and untagged_area_remote() works locklessly.
> > > >
> > > > Yeah I don't know why we're doing this at all? This seems new unless I
> > > > missed it?
> > >
> > > Because untagged_addr_remote() has a mmap_assert_locked(mm) on x86 and
> > > reads data that is updated under the mmap lock, I think? So without
> > > this change you should get a lockdep splat on x86.
> > >
> > > > > (Or you could try to use something like the
> > > > > mmap_write_lock_with_all_vmas() I proposed above for synchronizing
> > > > > against untagged_addr(), first write-lock the MM and then write-lock
> > > > > all VMAs in it...)
> > > >
> > > > This would completely eliminate the point of this patch no? The whole point
> > > > is not taking these locks... And I'm very much not in favour of
> > > > write-locking literally every single VMA. under any circumstances.
> > >
> > > I'm talking about doing this heavyweight locking in places like
> > > arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) that can, if I understand
> > > correctly, essentially reconfigure the size of the virtual address
> > > space of a running process from 56-bit to 47-bit at the hardware level
> > > and cause address bits that were previously part of the virtual
> > > address to be ignored. READ_ONCE()/WRITE_ONCE() might do the job too,
> > > but then we'll have to keep in mind that two subsequent invocations of
> > > untagged_addr() can translate a userspace-specified virtual address
> > > into two different virtual addresses at the page table level.
> >
> > I’m confused about how arch_prctl(ARCH_ENABLE_TAGGED_ADDR, ...) can
> > reconfigure a running process from using 56-bit addresses to 47-bit.
> > I read the code and see the x86 kernel only supports LAM U57, and not
> > LAM U48 at all:
> >
> > static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
> > {
> >         ...
> >
> >         if (!nr_bits || nr_bits > LAM_U57_BITS) {
> >                 mmap_write_unlock(mm);
> >                 return -EINVAL;
> >         }
> >
> >         mm_enable_lam(mm);
> >         mmap_write_unlock(mm);
> >
> >         return 0;
> > }
>
> Oh, you're right, currently only LAM U57 is supported by Linux, I was
> making bad assumptions.
>
> commit 2f8794bd087e, which introduced that code, also mentions that
> "For now only LAM_U57 is supported, with 6 tag bits".
>
> > I still don't fully understand why x86 differs from ARM64,
> > where the same bit mask is always applied unconditionally.
> >
> > On ARM64, we can even enable or disable PROT_MTE on a per-VMA basis
> > using mmap or mprotect. However, the same bitmask operation is
> > always executed regardless of whether memory tags are present for a
> > given VMA.
>
> Hmm, true, that does look like a weird difference.
>
> > I mean, on arm64, if a process or a VMA doesn't have tag access
> > enabled, and we pass an address with high bits to madvise,
> > untagged_addr() will still strip the tag. But wouldn't that address
> > be invalid for a process or VMA that doesn't have TBI enabled?
>
> Yeah, I guess in that regard it might be fine to always strip the bits...
>
> Maybe the situation on arm64 is simpler, in that these bits are always
> either tag bits or unused on arm64, while my understanding is that on
> X86, the CPU supports changing the meaning of address bits between
> "part of virtual address" and "ignored tag bits". So I think stripping
> the bits might work fine for LAM U57, but not for LAM U48, and maybe
> the code is trying to be future-proof in case someone wants to add
> support for LAM U48?

Perhaps supporting two different tag lengths is a bug rather than a feature?

>
> It is arguably also a bit more robust to reject garbage addresses
> instead of ignoring bits that would cause the CPU to treat the address
> as noncanonical, but I guess doing that just in MM code is not a big
> problem. (Unless someone relies on seccomp filters that block
> manipulation of specific virtual address ranges via munmap() and such,
> but I think almost nobody does that. I think I've only seen that once
> in some rarely-used component of the Tor project.)
>

Unless we perform a strict check in the kernel to ensure the allocated tag
matches the logical tag, random high-bit values are still acceptable.
For example, x86 only checks if the mask is -1UL or above 57 bits—
there’s no mechanism to exclude arbitrary high-bit values.

Otherwise, there's no point in checking whether tagging is enabled for the VMA
or the mm. Thus, we end up applying a mask unconditionally anyway.

> But I am out of my depth here and might be severely misunderstanding
> what's going on.

Same here :-) Especially since I have no idea what's going on with RISC-V
with two different tagging lengths.

Thanks
Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-05  3:23           ` Qi Zheng
@ 2025-06-05 14:04             ` Lorenzo Stoakes
  2025-06-06  3:55               ` Qi Zheng
  0 siblings, 1 reply; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-05 14:04 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Thu, Jun 05, 2025 at 11:23:18AM +0800, Qi Zheng wrote:
>
>
> On 6/5/25 1:50 AM, Lorenzo Stoakes wrote:
> > On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
> > > Hi Lorenzo,
> > >
> > > On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
> > > > On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
> > > > > Hi Jann,
> > > > >
> > > > > On 5/30/25 10:06 PM, Jann Horn wrote:
> > > > > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > > > > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > > > > > > frequently than other madvise options, particularly in native and Java
> > > > > > > heaps for dynamic memory management.
> > > > > > >
> > > > > > > Currently, the mmap_lock is always held during these operations, even when
> > > > > > > unnecessary. This causes lock contention and can lead to severe priority
> > > > > > > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > > > > > > hold the lock and block higher-priority threads.
> > > > > > >
> > > > > > > This patch enables the use of per-VMA locks when the advised range lies
> > > > > > > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > > > > > > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> > > > > > >
> > > > > > > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > > > > > > benefits from this per-VMA lock optimization. After extended runtime,
> > > > > > > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > > > > > > only 1,231 fell back to mmap_lock.
> > > > > > >
> > > > > > > To simplify handling, the implementation falls back to the standard
> > > > > > > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > > > > > > userfaultfd_remove().
> > > > > >
> > > > > > One important quirk of this is that it can, from what I can see, cause
> > > > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > > > lock at all:
> > > > > >
> > > > > > do_madvise [behavior=MADV_DONTNEED]
> > > > > >      madvise_lock
> > > > > >        lock_vma_under_rcu
> > > > > >      madvise_do_behavior
> > > > > >        madvise_single_locked_vma
> > > > > >          madvise_vma_behavior
> > > > > >            madvise_dontneed_free
> > > > > >              madvise_dontneed_single_vma
> > > > > >                zap_page_range_single_batched [.reclaim_pt = true]
> > > > > >                  unmap_single_vma
> > > > > >                    unmap_page_range
> > > > > >                      zap_p4d_range
> > > > > >                        zap_pud_range
> > > > > >                          zap_pmd_range
> > > > > >                            zap_pte_range
> > > > > >                              try_get_and_clear_pmd
> > > > > >                              free_pte
> > > > > >
> > > > > > This clashes with the assumption in walk_page_range_novma() that
> > > > > > holding the mmap lock in write mode is sufficient to prevent
> > > > > > concurrent page table freeing, so it can probably lead to page table
> > > > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > > > >
> > > > > Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
> > > > > following case:
> > > > >
> > > > > cpu 0				cpu 1
> > > > >
> > > > > ptdump_walk_pgd
> > > > > --> walk_pte_range
> > > > >       --> pte_offset_map (hold RCU read lock)
> > > > > 				zap_pte_range
> > > > > 				--> free_pte (via RCU)
> > > > >           walk_pte_range_inner
> > > > >           --> ptdump_pte_entry (the PTE page is not freed at this time)
> > > > >
> > > > > IIUC, there is no UAF issue here?
> > > > >
> > > > > If I missed anything please let me know.
> >
> > Seems to me that we don't need the VMA locks then unless I'm missing
> > something? :) Jann?
> >
> > Would this RCU-lock-acquired-by-pte_offset_map also save us from the
> > munmap() downgraded read lock scenario also? Or is the problem there
> > intermediate page table teardown I guess?
> >
>
> Right. Currently, page table pages other than PTE pages are not
> protected by RCU, so mmap write lock still needed in the munmap path
> to wait for all readers of the page table pages to exit the critical
> section.
>
> In other words, once we have achieved that all page table pages are
> protected by RCU, we can completely remove the page table pages from
> the protection of mmap locks.

Interesting - so on reclaim/migrate we are just clearing PTE entries with
the rmap lock right? Would this lead to a future where we could also tear
down page tables there?

Another point to remember is that when we are clearing down higher level
page tables in the general case, the logic assumes nothing else can touch
anything... we hold both rmap lock AND mmap/vma locks at this point.

But I guess if we're RCU-safe, we're safe even from rmap, right?

>
> Here are some of my previous thoughts:
>
> ```
> Another plan
> ============
>
> Currently, page table modifications are protected by page table locks
> (page_table_lock or split pmd/pte lock), but the life cycle of page
> table pages is protected by mmap_lock (and vma lock). For more details,
> please refer to the recently added Documentation/mm/process_addrs.rst file.
>
> Currently we try to free the PTE pages through RCU when
> CONFIG_PT_RECLAIM is turned on. In this case, we will no longer
> need to hold mmap_lock for the read/write op on the PTE pages.
>
> So maybe we can remove the page table from the protection of the mmap
> lock (which is too big), like this:
>
> 1. free all levels of page table pages by RCU, not just PTE pages, but
>    also pmd, pud, etc.
> 2. similar to pte_offset_map/pte_unmap, add
>    [pmd|pud]_offset_map/[pmd|pud]_unmap, and make them all contain
>    rcu_read_lock/rcu_read_unlock, and make them accept failure.
>
> In this way, we no longer need the mmap lock. For readers, such as page
> table walkers, we are already in the critical section of RCU. For
> writers, we only need to hold the page table lock.
>
> But there is a difficulty here, that is, the RCU critical section is not
> allowed to sleep, but it is possible to sleep in the callback function
> of .pmd_entry, such as mmu_notifier_invalidate_range_start().
>
> Use SRCU instead? Or use RCU + refcount method? Not sure. But I think
> it's an interesting thing to try.

Thanks for the information, RCU freeing of page tables is something of a
long-term TODO discussed back and forth :) might take a look myself if
somebody else hasn't grabbed when I have a second...

Is it _only_ the mmu notifier sleeping in this scenario? Or are there other
examples?

We could in theory always add another callback .pmd_entry_sleep or
something for this one case and document the requirement...

> ```
>
> Thanks!
>
>

Cheers, Lorenzo


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-05 14:04             ` Lorenzo Stoakes
@ 2025-06-06  3:55               ` Qi Zheng
  2025-06-06 10:44                 ` Lorenzo Stoakes
  0 siblings, 1 reply; 35+ messages in thread
From: Qi Zheng @ 2025-06-06  3:55 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Hi Lorenzo,

On 6/5/25 10:04 PM, Lorenzo Stoakes wrote:
> On Thu, Jun 05, 2025 at 11:23:18AM +0800, Qi Zheng wrote:
>>
>>
>> On 6/5/25 1:50 AM, Lorenzo Stoakes wrote:
>>> On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
>>>> Hi Lorenzo,
>>>>
>>>> On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
>>>>> On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
>>>>>> Hi Jann,
>>>>>>
>>>>>> On 5/30/25 10:06 PM, Jann Horn wrote:
>>>>>>> On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
>>>>>>>> Certain madvise operations, especially MADV_DONTNEED, occur far more
>>>>>>>> frequently than other madvise options, particularly in native and Java
>>>>>>>> heaps for dynamic memory management.
>>>>>>>>
>>>>>>>> Currently, the mmap_lock is always held during these operations, even when
>>>>>>>> unnecessary. This causes lock contention and can lead to severe priority
>>>>>>>> inversion, where low-priority threads—such as Android's HeapTaskDaemon—
>>>>>>>> hold the lock and block higher-priority threads.
>>>>>>>>
>>>>>>>> This patch enables the use of per-VMA locks when the advised range lies
>>>>>>>> entirely within a single VMA, avoiding the need for full VMA traversal. In
>>>>>>>> practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
>>>>>>>>
>>>>>>>> Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
>>>>>>>> benefits from this per-VMA lock optimization. After extended runtime,
>>>>>>>> 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
>>>>>>>> only 1,231 fell back to mmap_lock.
>>>>>>>>
>>>>>>>> To simplify handling, the implementation falls back to the standard
>>>>>>>> mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
>>>>>>>> userfaultfd_remove().
>>>>>>>
>>>>>>> One important quirk of this is that it can, from what I can see, cause
>>>>>>> freeing of page tables (through pt_reclaim) without holding the mmap
>>>>>>> lock at all:
>>>>>>>
>>>>>>> do_madvise [behavior=MADV_DONTNEED]
>>>>>>>       madvise_lock
>>>>>>>         lock_vma_under_rcu
>>>>>>>       madvise_do_behavior
>>>>>>>         madvise_single_locked_vma
>>>>>>>           madvise_vma_behavior
>>>>>>>             madvise_dontneed_free
>>>>>>>               madvise_dontneed_single_vma
>>>>>>>                 zap_page_range_single_batched [.reclaim_pt = true]
>>>>>>>                   unmap_single_vma
>>>>>>>                     unmap_page_range
>>>>>>>                       zap_p4d_range
>>>>>>>                         zap_pud_range
>>>>>>>                           zap_pmd_range
>>>>>>>                             zap_pte_range
>>>>>>>                               try_get_and_clear_pmd
>>>>>>>                               free_pte
>>>>>>>
>>>>>>> This clashes with the assumption in walk_page_range_novma() that
>>>>>>> holding the mmap lock in write mode is sufficient to prevent
>>>>>>> concurrent page table freeing, so it can probably lead to page table
>>>>>>> UAF through the ptdump interface (see ptdump_walk_pgd()).
>>>>>>
>>>>>> Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
>>>>>> following case:
>>>>>>
>>>>>> cpu 0				cpu 1
>>>>>>
>>>>>> ptdump_walk_pgd
>>>>>> --> walk_pte_range
>>>>>>        --> pte_offset_map (hold RCU read lock)
>>>>>> 				zap_pte_range
>>>>>> 				--> free_pte (via RCU)
>>>>>>            walk_pte_range_inner
>>>>>>            --> ptdump_pte_entry (the PTE page is not freed at this time)
>>>>>>
>>>>>> IIUC, there is no UAF issue here?
>>>>>>
>>>>>> If I missed anything please let me know.
>>>
>>> Seems to me that we don't need the VMA locks then unless I'm missing
>>> something? :) Jann?
>>>
>>> Would this RCU-lock-acquired-by-pte_offset_map also save us from the
>>> munmap() downgraded read lock scenario also? Or is the problem there
>>> intermediate page table teardown I guess?
>>>
>>
>> Right. Currently, page table pages other than PTE pages are not
>> protected by RCU, so mmap write lock still needed in the munmap path
>> to wait for all readers of the page table pages to exit the critical
>> section.
>>
>> In other words, once we have achieved that all page table pages are
>> protected by RCU, we can completely remove the page table pages from
>> the protection of mmap locks.
> 
> Interesting - so on reclaim/migrate we are just clearing PTE entries with
> the rmap lock right? Would this lead to a future where we could also tear
> down page tables there?
> 
> Another point to remember is that when we are clearing down higher level
> page tables in the general case, the logic assumes nothing else can touch
> anything... we hold both rmap lock AND mmap/vma locks at this point.
> 
> > But I guess if we're RCU-safe, we're safe even from rmap, right?

Yeah, and we have already done something similar. For more details,
please refer to retract_page_tables(). It only holds i_mmap_rwsem read
lock and then calls pte_free_defer() to free the PTE page through RCU.

For the migrate case, the PTE entry will store a migration entry, right?
And a new physical page will be installed soon through a page fault, so I
don't think it is necessary to free the corresponding PTE page.

For the reclaim case, there is a problem: only the PTE entries that map
to a physical page are operated on each time. If we want to free the entire
PTE page, we need to check the adjacent PTE entries. Maybe MGLRU can
help with this; I remember that MGLRU has an optimization that checks
the adjacent PTE entries.
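
Just to illustrate, the emptiness check itself would be cheap, something like
the sketch below (this helper does not exist; in real code it would have to
run under the pte lock, and the freeing would still go through RCU):

```c
/*
 * Sketch only: decide whether a PTE table has become empty and is a
 * candidate for being freed.
 */
static bool pte_table_is_empty(pte_t *pte)
{
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++)
		if (!pte_none(ptep_get(pte + i)))
			return false;
	return true;
}
```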

> 
>>
>> Here are some of my previous thoughts:
>>
>> ```
>> Another plan
>> ============
>>
>> Currently, page table modifications are protected by page table locks
>> (page_table_lock or split pmd/pte lock), but the life cycle of page
>> table pages is protected by mmap_lock (and vma lock). For more details,
>> please refer to the recently added Documentation/mm/process_addrs.rst file.
>>
>> Currently we try to free the PTE pages through RCU when
>> CONFIG_PT_RECLAIM is turned on. In this case, we will no longer
>> need to hold mmap_lock for the read/write op on the PTE pages.
>>
>> So maybe we can remove the page table from the protection of the mmap
>> lock (which is too big), like this:
>>
>> 1. free all levels of page table pages by RCU, not just PTE pages, but
>>     also pmd, pud, etc.
>> 2. similar to pte_offset_map/pte_unmap, add
>>     [pmd|pud]_offset_map/[pmd|pud]_unmap, and make them all contain
>>     rcu_read_lock/rcu_read_unlock, and make them accept failure.
>>
>> In this way, we no longer need the mmap lock. For readers, such as page
>> table walkers, we are already in the critical section of RCU. For
>> writers, we only need to hold the page table lock.
>>
>> But there is a difficulty here, that is, the RCU critical section is not
>> allowed to sleep, but it is possible to sleep in the callback function
>> of .pmd_entry, such as mmu_notifier_invalidate_range_start().
>>
>> Use SRCU instead? Or use RCU + refcount method? Not sure. But I think
>> it's an interesting thing to try.
> 
> Thanks for the information, RCU freeing of page tables is something of a

RCU-freeing is relatively simple: tlb_remove_table() can easily be
changed to free all levels of page table pages through RCU. The more
difficult part is to protect the page table pages above the PTE level
with the RCU lock.
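
To make the second step of the plan quoted above a bit more concrete, a
pmd-level helper could look roughly like the sketch below. None of these
helpers exist today, and handling the failure case in every caller is the
part the sketch glosses over:

```c
/*
 * Hypothetical pmd_offset_map()/pmd_unmap() pair: enter an RCU read-side
 * section for the lookup, and allow it to fail if the pud entry is gone.
 */
static pmd_t *pmd_offset_map_sketch(pud_t *pud, unsigned long addr)
{
	pud_t pudval;

	rcu_read_lock();
	pudval = READ_ONCE(*pud);
	if (pud_none(pudval) || unlikely(pud_bad(pudval))) {
		rcu_read_unlock();
		return NULL;		/* caller must cope with failure */
	}
	return pmd_offset(&pudval, addr);
}

static void pmd_unmap_sketch(void)
{
	rcu_read_unlock();
}
```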

> long-term TODO discussed back and forth :) might take a look myself if
> somebody else hasn't grabbed when I have a second...

This is awesome, I'm stuck with some other stuff at the moment, I'll
also take a look at it later when I have time.

> 
> Is it _only_ the mmu notifier sleeping in this scenario? Or are there other
> examples?

I'm not sure, need some investigation.

> 
> We could in theory always add another callback .pmd_entry_sleep or
> something for this one case and document the requirement...

Maybe, but the SRCU critical section cannot prevent the PTE page from
being freed via RCU. :(

Thanks!

> 
>> ```
>>
>> Thanks!
>>
>>
> 
> Cheers, Lorenzo



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-03 20:17   ` Suren Baghdasaryan
  2025-06-04  5:22     ` Lorenzo Stoakes
@ 2025-06-06  7:18     ` Barry Song
  2025-06-06 10:16       ` Lorenzo Stoakes
  1 sibling, 1 reply; 35+ messages in thread
From: Barry Song @ 2025-06-06  7:18 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Lorenzo Stoakes, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Lokesh Gidra, Tangquan Zheng

On Wed, Jun 4, 2025 at 8:17 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jun 3, 2025 at 11:43 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > Hi Barry,
> >
> > As promised, I enclose a patch to give a sense of how I think we might
> > thread state through this operation.
> >
> > There's a todo on the untagged stuff so you can figure that out. This is
> > based on the v1 so it might not encompass everything you addressed in the
> > v2.
> >
> > Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks, I
> > _despise_ the void *arg function ptr stuff there added just for the anon
> > vma name stuff (ughhh) so might be the only sensible way of threading
> > state.
> >
> > I don't need any attribution, so please use this patch as you see
> > fit/adapt/delete/do whatever with it, just an easier way for me to show the
> > idea!
> >
> > I did some very basic testing and it seems to work, but nothing deeper.
> >
> > Cheers, Lorenzo

Really appreciate your work on this, Lorenzo.

> >
> > ----8<----
> > From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > Date: Tue, 3 Jun 2025 18:22:55 +0100
> > Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
> >
> > Refactor the madvise() code to retain state about the locking mode utilised
> > for traversing VMAs.
> >
> > Then use this mechanism to permit VMA locking to be done later in the
> > madvise() logic and also to allow altering of the locking mode to permit
> > falling back to an mmap read lock if required.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/madvise.c | 174 +++++++++++++++++++++++++++++++++++++--------------
> >  1 file changed, 127 insertions(+), 47 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 5f7a66a1617e..a3a6d73d0bd5 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -48,38 +48,19 @@ struct madvise_walk_private {
> >         bool pageout;
> >  };
> >
> > +enum madvise_lock_mode {
> > +       MADVISE_NO_LOCK,
> > +       MADVISE_MMAP_READ_LOCK,
> > +       MADVISE_MMAP_WRITE_LOCK,
> > +       MADVISE_VMA_READ_LOCK,
> > +};
> > +
> >  struct madvise_behavior {
> >         int behavior;
> >         struct mmu_gather *tlb;
> > +       enum madvise_lock_mode lock_mode;
> >  };
> >
> > -/*
> > - * Any behaviour which results in changes to the vma->vm_flags needs to
> > - * take mmap_lock for writing. Others, which simply traverse vmas, need
> > - * to only take it for reading.
> > - */
> > -static int madvise_need_mmap_write(int behavior)
> > -{
> > -       switch (behavior) {
> > -       case MADV_REMOVE:
> > -       case MADV_WILLNEED:
> > -       case MADV_DONTNEED:
> > -       case MADV_DONTNEED_LOCKED:
> > -       case MADV_COLD:
> > -       case MADV_PAGEOUT:
> > -       case MADV_FREE:
> > -       case MADV_POPULATE_READ:
> > -       case MADV_POPULATE_WRITE:
> > -       case MADV_COLLAPSE:
> > -       case MADV_GUARD_INSTALL:
> > -       case MADV_GUARD_REMOVE:
> > -               return 0;
> > -       default:
> > -               /* be safe, default to 1. list exceptions explicitly */
> > -               return 1;
> > -       }
> > -}
> > -
> >  #ifdef CONFIG_ANON_VMA_NAME
> >  struct anon_vma_name *anon_vma_name_alloc(const char *name)
> >  {
> > @@ -1486,6 +1467,43 @@ static bool process_madvise_remote_valid(int behavior)
> >         }
> >  }
> >
> > +/*
> > + * Try to acquire a VMA read lock if possible.
> > + *
> > + * We only support this lock over a single VMA, which the input range must
> > + * span, either partially or fully.
> > + *
> > + * This function always returns with an appropriate lock held. If a VMA read
> > + * lock could be acquired, we return the locked VMA.
> > + *
> > + * If a VMA read lock could not be acquired, we return NULL and expect caller to
>
> Worth mentioning that the function itself will fall back to taking
> mmap_read_lock in such a case.
>
> > + * fallback to mmap lock behaviour.
> > + */
> > +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior,
> > +               unsigned long start, unsigned long end)
> > +{
> > +       struct vm_area_struct *vma;
> > +
> > +       if (!madv_behavior || madv_behavior->lock_mode != MADVISE_VMA_READ_LOCK)
>
> nit: I think it would be better to do this check before calling
> try_vma_read_lock(). IMHO it does not make sense to call
> try_vma_read_lock() when lock_mode != MADVISE_VMA_READ_LOCK. It also
> makes reading this function easier. The first time I looked at it and
> saw "return NULL" in one place that takes mmap_read_lock() and another
> place which returns the same NULL but does not take mmap_lock really
> confused me.
>
> > +               return NULL;
> > +
> > +       vma = lock_vma_under_rcu(mm, start);
> > +       if (!vma)
> > +               goto take_mmap_read_lock;
> > +       /* We must span only a single VMA, uffd unsupported. */
> > +       if (end > vma->vm_end || userfaultfd_armed(vma)) {
>
> vma->vm_end is not inclusive, so the above condition I think should be
> (end >= vma->vm_end || ...)

I don't quite understand why `end > vma->vm_end` would be a problem.
For a VMA with `vm_start = X` and `vm_end = X + 0x1000`, wouldn't the
range `(X, X + 0x1000)`—where `end == vm_end`—still be a valid
candidate for using a per-VMA lock?

We're checking which cases are not eligible for per-VMA locking,
not which ones are.
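
To spell the boundary case out with concrete numbers (illustrative values
only, nothing here comes from the patch):

```c
/* A VMA covering [0x7f0000000000, 0x7f0000001000), i.e. one 4KiB page. */
unsigned long vm_start = 0x7f0000000000UL;
unsigned long vm_end   = 0x7f0000001000UL;	/* exclusive */

/* madvise(vm_start, 0x1000, MADV_DONTNEED): end == vm_end. */
unsigned long start = vm_start;
unsigned long end   = start + 0x1000;

/*
 * "end > vma->vm_end" is false: the range lies entirely within this one
 * VMA, so the per-VMA read lock path is taken, which is what we want.
 * "end >= vma->vm_end" would push this common whole-VMA case onto the
 * mmap_lock fallback for no reason.
 */
```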

>
> > +               vma_end_read(vma);
> > +               goto take_mmap_read_lock;
> > +       }
> > +       return vma;
> > +
> > +take_mmap_read_lock:
> > +       mmap_read_lock(mm);
> > +       madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
> > +       return NULL;
> > +}
> > +
> >  /*
> >   * Walk the vmas in range [start,end), and call the visit function on each one.
> >   * The visit function will get start and end parameters that cover the overlap
> > @@ -1496,7 +1514,8 @@ static bool process_madvise_remote_valid(int behavior)
> >   */
> >  static
> >  int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> > -                     unsigned long end, void *arg,
> > +                     unsigned long end, struct madvise_behavior *madv_behavior,
> > +                     void *arg,
> >                       int (*visit)(struct vm_area_struct *vma,
> >                                    struct vm_area_struct **prev, unsigned long start,
> >                                    unsigned long end, void *arg))
> > @@ -1505,6 +1524,15 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> >         struct vm_area_struct *prev;
> >         unsigned long tmp;
> >         int unmapped_error = 0;
> > +       int error;
> > +
> > +       /* If VMA read lock supported, we apply advice to a single VMA only. */
> > +       vma = try_vma_read_lock(mm, madv_behavior, start, end);
> > +       if (vma) {
> > +               error = visit(vma, &prev, start, end, arg);
> > +               vma_end_read(vma);
> > +               return error;
> > +       }
> >
> >         /*
> >          * If the interval [start,end) covers some unmapped address
> > @@ -1516,8 +1544,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> >                 prev = vma;
> >
> >         for (;;) {
> > -               int error;
> > -
> >                 /* Still start < end. */
> >                 if (!vma)
> >                         return -ENOMEM;
> > @@ -1598,34 +1624,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> >         if (end == start)
> >                 return 0;
> >
> > -       return madvise_walk_vmas(mm, start, end, anon_name,
> > +       return madvise_walk_vmas(mm, start, end, anon_name, NULL,

I think this should be:

 +       return madvise_walk_vmas(mm, start, end, NULL, anon_name,

> >                                  madvise_vma_anon_name);
> >  }
> >  #endif /* CONFIG_ANON_VMA_NAME */
> >
> > -static int madvise_lock(struct mm_struct *mm, int behavior)
> > +
> > +/*
> > + * Any behaviour which results in changes to the vma->vm_flags needs to
> > + * take mmap_lock for writing. Others, which simply traverse vmas, need
> > + * to only take it for reading.
> > + */
> > +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
> >  {
> > +       int behavior = madv_behavior->behavior;
> > +
> >         if (is_memory_failure(behavior))
> > -               return 0;
> > +               return MADVISE_NO_LOCK;
> >
> > -       if (madvise_need_mmap_write(behavior)) {
> > +       switch (behavior) {
> > +       case MADV_REMOVE:
> > +       case MADV_WILLNEED:
> > +       case MADV_COLD:
> > +       case MADV_PAGEOUT:
> > +       case MADV_FREE:
> > +       case MADV_POPULATE_READ:
> > +       case MADV_POPULATE_WRITE:
> > +       case MADV_COLLAPSE:
> > +       case MADV_GUARD_INSTALL:
> > +       case MADV_GUARD_REMOVE:
> > +               return MADVISE_MMAP_READ_LOCK;
> > +       case MADV_DONTNEED:
> > +       case MADV_DONTNEED_LOCKED:
> > +               return MADVISE_VMA_READ_LOCK;
> > +       default:
> > +               return MADVISE_MMAP_WRITE_LOCK;
> > +       }
> > +}
> > +
> > +static int madvise_lock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> > +{
> > +       enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
> > +
> > +       switch (lock_mode) {
> > +       case MADVISE_NO_LOCK:
> > +               break;
> > +       case MADVISE_MMAP_WRITE_LOCK:
> >                 if (mmap_write_lock_killable(mm))
> >                         return -EINTR;
> > -       } else {
> > +               break;
> > +       case MADVISE_MMAP_READ_LOCK:
> >                 mmap_read_lock(mm);
> > +               break;
> > +       case MADVISE_VMA_READ_LOCK:
> > +               /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
> > +               break;
> >         }
> > +
> > +       madv_behavior->lock_mode = lock_mode;
> >         return 0;
> >  }
> >
> > -static void madvise_unlock(struct mm_struct *mm, int behavior)
> > +static void madvise_unlock(struct mm_struct *mm,
> > +               struct madvise_behavior *madv_behavior)
> >  {
> > -       if (is_memory_failure(behavior))
> > +       switch (madv_behavior->lock_mode) {
> > +       case  MADVISE_NO_LOCK:
> >                 return;
> > -
> > -       if (madvise_need_mmap_write(behavior))
> > +       case MADVISE_MMAP_WRITE_LOCK:
> >                 mmap_write_unlock(mm);
> > -       else
> > +               break;
> > +       case MADVISE_MMAP_READ_LOCK:
> >                 mmap_read_unlock(mm);
> > +               break;
> > +       case MADVISE_VMA_READ_LOCK:
> > +               /* We will drop the lock per-VMA in madvise_walk_vmas(). */
> > +               break;
> > +       }
> > +
> > +       madv_behavior->lock_mode = MADVISE_NO_LOCK;
> >  }
> >
> >  static bool madvise_batch_tlb_flush(int behavior)
> > @@ -1721,6 +1799,8 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >
> >         if (is_memory_failure(behavior))
> >                 return madvise_inject_error(behavior, start, start + len_in);
> > +
> > +       // TODO: handle untagged stuff here...
> >         start = untagged_addr(start); //untagged_addr_remote(mm, start);
> >         end = start + PAGE_ALIGN(len_in);
> >
> > @@ -1729,7 +1809,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
> >                 error = madvise_populate(mm, start, end, behavior);
> >         else
> >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> > -                                         madvise_vma_behavior);
> > +                                         madv_behavior, madvise_vma_behavior);
> >         blk_finish_plug(&plug);
> >         return error;
> >  }
> > @@ -1817,13 +1897,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
> >
> >         if (madvise_should_skip(start, len_in, behavior, &error))
> >                 return error;
> > -       error = madvise_lock(mm, behavior);
> > +       error = madvise_lock(mm, &madv_behavior);
> >         if (error)
> >                 return error;
> >         madvise_init_tlb(&madv_behavior, mm);
> >         error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
> >         madvise_finish_tlb(&madv_behavior);
> > -       madvise_unlock(mm, behavior);
> > +       madvise_unlock(mm, &madv_behavior);
> >
> >         return error;
> >  }
> > @@ -1847,7 +1927,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >         total_len = iov_iter_count(iter);
> >
> > -       ret = madvise_lock(mm, behavior);
> > +       ret = madvise_lock(mm, &madv_behavior);
> >         if (ret)
> >                 return ret;
> >         madvise_init_tlb(&madv_behavior, mm);
> > @@ -1880,8 +1960,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >
> >                         /* Drop and reacquire lock to unwind race. */
> >                         madvise_finish_tlb(&madv_behavior);
> > -                       madvise_unlock(mm, behavior);
> > -                       ret = madvise_lock(mm, behavior);
> > +                       madvise_unlock(mm, &madv_behavior);
> > +                       ret = madvise_lock(mm, &madv_behavior);
> >                         if (ret)
> >                                 goto out;
> >                         madvise_init_tlb(&madv_behavior, mm);
> > @@ -1892,7 +1972,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> >                 iov_iter_advance(iter, iter_iov_len(iter));
> >         }
> >         madvise_finish_tlb(&madv_behavior);
> > -       madvise_unlock(mm, behavior);
> > +       madvise_unlock(mm, &madv_behavior);
> >
> >  out:
> >         ret = (total_len - iov_iter_count(iter)) ? : ret;
> > --
> > 2.49.0

Thanks
Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-06  7:18     ` Barry Song
@ 2025-06-06 10:16       ` Lorenzo Stoakes
  0 siblings, 0 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-06 10:16 UTC (permalink / raw)
  To: Barry Song
  Cc: Suren Baghdasaryan, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka, Jann Horn,
	Lokesh Gidra, Tangquan Zheng

On Fri, Jun 06, 2025 at 07:18:25PM +1200, Barry Song wrote:
> On Wed, Jun 4, 2025 at 8:17 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Jun 3, 2025 at 11:43 AM Lorenzo Stoakes
> > <lorenzo.stoakes@oracle.com> wrote:
> > >
> > > Hi Barry,
> > >
> > > As promised, I enclose a patch to give a sense of how I think we might
> > > thread state through this operation.
> > >
> > > There's a todo on the untagged stuff so you can figure that out. This is
> > > based on the v1 so it might not encompass everything you addressed in the
> > > v2.
> > >
> > > Passing in madv_behavior to madvise_walk_vmas() twice kinda sucks, I
> > > _despise_ the void *arg function ptr stuff there added just for the anon
> > > vma name stuff (ughhh) so might be the only sensible way of threading
> > > state.
> > >
> > > I don't need any attribution, so please use this patch as you see
> > > fit/adapt/delete/do whatever with it, just an easier way for me to show the
> > > idea!
> > >
> > > I did some very basic testing and it seems to work, but nothing deeper.
> > >
> > > Cheers, Lorenzo
>
> Really appreciate your work on this, Lorenzo.

No problem! Hopefully having it in code form makes things clearer on how I
felt we could thread state through here.

The truth is the madvise code, despite SJ's great efforts to improve it
(and indeed he has!), is still a bit of a mess so it's fiddly. More work to
do there I think!

Obviously you are replying to Suren below but I will be so bold as to
quickly give my thoughts here :P

>
> > >
> > > ----8<----
> > > From ff4ba0115cb31a0630b6f8c02c68f11b3fb71f7a Mon Sep 17 00:00:00 2001
> > > From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > Date: Tue, 3 Jun 2025 18:22:55 +0100
> > > Subject: [PATCH] mm/madvise: support VMA read locks for MADV_DONTNEED[_LOCKED]
> > >
> > > Refactor the madvise() code to retain state about the locking mode utilised
> > > for traversing VMAs.
> > >
> > > Then use this mechanism to permit VMA locking to be done later in the
> > > madvise() logic and also to allow altering of the locking mode to permit
> > > falling back to an mmap read lock if required.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > > ---
> > >  mm/madvise.c | 174 +++++++++++++++++++++++++++++++++++++--------------
> > >  1 file changed, 127 insertions(+), 47 deletions(-)
> > >
> > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > index 5f7a66a1617e..a3a6d73d0bd5 100644
> > > --- a/mm/madvise.c
> > > +++ b/mm/madvise.c
> > > @@ -48,38 +48,19 @@ struct madvise_walk_private {
> > >         bool pageout;
> > >  };
> > >
> > > +enum madvise_lock_mode {
> > > +       MADVISE_NO_LOCK,
> > > +       MADVISE_MMAP_READ_LOCK,
> > > +       MADVISE_MMAP_WRITE_LOCK,
> > > +       MADVISE_VMA_READ_LOCK,
> > > +};
> > > +
> > >  struct madvise_behavior {
> > >         int behavior;
> > >         struct mmu_gather *tlb;
> > > +       enum madvise_lock_mode lock_mode;
> > >  };
> > >
> > > -/*
> > > - * Any behaviour which results in changes to the vma->vm_flags needs to
> > > - * take mmap_lock for writing. Others, which simply traverse vmas, need
> > > - * to only take it for reading.
> > > - */
> > > -static int madvise_need_mmap_write(int behavior)
> > > -{
> > > -       switch (behavior) {
> > > -       case MADV_REMOVE:
> > > -       case MADV_WILLNEED:
> > > -       case MADV_DONTNEED:
> > > -       case MADV_DONTNEED_LOCKED:
> > > -       case MADV_COLD:
> > > -       case MADV_PAGEOUT:
> > > -       case MADV_FREE:
> > > -       case MADV_POPULATE_READ:
> > > -       case MADV_POPULATE_WRITE:
> > > -       case MADV_COLLAPSE:
> > > -       case MADV_GUARD_INSTALL:
> > > -       case MADV_GUARD_REMOVE:
> > > -               return 0;
> > > -       default:
> > > -               /* be safe, default to 1. list exceptions explicitly */
> > > -               return 1;
> > > -       }
> > > -}
> > > -
> > >  #ifdef CONFIG_ANON_VMA_NAME
> > >  struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > >  {
> > > @@ -1486,6 +1467,43 @@ static bool process_madvise_remote_valid(int behavior)
> > >         }
> > >  }
> > >
> > > +/*
> > > + * Try to acquire a VMA read lock if possible.
> > > + *
> > > + * We only support this lock over a single VMA, which the input range must
> > > + * span, either partially or fully.
> > > + *
> > > + * This function always returns with an appropriate lock held. If a VMA read
> > > + * lock could be acquired, we return the locked VMA.
> > > + *
> > > + * If a VMA read lock could not be acquired, we return NULL and expect caller to
> >
> > Worth mentioning that the function itself will fall back to taking
> > mmap_read_lock in such a case.
> >
> > > + * fallback to mmap lock behaviour.
> > > + */
> > > +static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> > > +               struct madvise_behavior *madv_behavior,
> > > +               unsigned long start, unsigned long end)
> > > +{
> > > +       struct vm_area_struct *vma;
> > > +
> > > +       if (!madv_behavior || madv_behavior->lock_mode != MADVISE_VMA_READ_LOCK)
> >
> > nit: I think it would be better to do this check before calling
> > try_vma_read_lock(). IMHO it does not make sense to call
> > try_vma_read_lock() when lock_mode != MADVISE_VMA_READ_LOCK. It also
> > makes reading this function easier. The first time I looked at it and
> > saw "return NULL" in one place that takes mmap_read_lock() and another
> > place which returns the same NULL but does not take mmap_lock really
> > confused me.
> >
> > > +               return NULL;
> > > +
> > > +       vma = lock_vma_under_rcu(mm, start);
> > > +       if (!vma)
> > > +               goto take_mmap_read_lock;
> > > +       /* We must span only a single VMA, uffd unsupported. */
> > > +       if (end > vma->vm_end || userfaultfd_armed(vma)) {
> >
> > vma->vm_end is not inclusive, so the above condition I think should be
> > (end >= vma->vm_end || ...)
>
> I don't quite understand why `end > vma->vm_end` would be a problem.
> For a VMA with `vm_start = X` and `vm_end = X + 0x1000`, wouldn't the
> range `(X, X + 0x1000)`—where `end == vm_end`—still be a valid
> candidate for using a per-VMA lock?
>
> We're checking which cases are not eligible for per-VMA locking,
> not which ones are.

Yeah agreed.

>
> >
> > > +               vma_end_read(vma);
> > > +               goto take_mmap_read_lock;
> > > +       }
> > > +       return vma;
> > > +
> > > +take_mmap_read_lock:
> > > +       mmap_read_lock(mm);
> > > +       madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
> > > +       return NULL;
> > > +}
> > > +
> > >  /*
> > >   * Walk the vmas in range [start,end), and call the visit function on each one.
> > >   * The visit function will get start and end parameters that cover the overlap
> > > @@ -1496,7 +1514,8 @@ static bool process_madvise_remote_valid(int behavior)
> > >   */
> > >  static
> > >  int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> > > -                     unsigned long end, void *arg,
> > > +                     unsigned long end, struct madvise_behavior *madv_behavior,
> > > +                     void *arg,
> > >                       int (*visit)(struct vm_area_struct *vma,
> > >                                    struct vm_area_struct **prev, unsigned long start,
> > >                                    unsigned long end, void *arg))
> > > @@ -1505,6 +1524,15 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> > >         struct vm_area_struct *prev;
> > >         unsigned long tmp;
> > >         int unmapped_error = 0;
> > > +       int error;
> > > +
> > > +       /* If VMA read lock supported, we apply advice to a single VMA only. */
> > > +       vma = try_vma_read_lock(mm, madv_behavior, start, end);
> > > +       if (vma) {
> > > +               error = visit(vma, &prev, start, end, arg);
> > > +               vma_end_read(vma);
> > > +               return error;
> > > +       }
> > >
> > >         /*
> > >          * If the interval [start,end) covers some unmapped address
> > > @@ -1516,8 +1544,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
> > >                 prev = vma;
> > >
> > >         for (;;) {
> > > -               int error;
> > > -
> > >                 /* Still start < end. */
> > >                 if (!vma)
> > >                         return -ENOMEM;
> > > @@ -1598,34 +1624,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> > >         if (end == start)
> > >                 return 0;
> > >
> > > -       return madvise_walk_vmas(mm, start, end, anon_name,
> > > +       return madvise_walk_vmas(mm, start, end, anon_name, NULL,
>
> I think this should be:
>
>  +       return madvise_walk_vmas(mm, start, end, NULL, anon_name,

Whoops! :) you're right!

I kind of hate having to pass the state like this but I guess for now we'll
live with it and can refactor this whole thing later perhaps.

>
> > >                                  madvise_vma_anon_name);
> > >  }
> > >  #endif /* CONFIG_ANON_VMA_NAME */
> > >
> > > -static int madvise_lock(struct mm_struct *mm, int behavior)
> > > +
> > > +/*
> > > + * Any behaviour which results in changes to the vma->vm_flags needs to
> > > + * take mmap_lock for writing. Others, which simply traverse vmas, need
> > > + * to only take it for reading.
> > > + */
> > > +static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
> > >  {
> > > +       int behavior = madv_behavior->behavior;
> > > +
> > >         if (is_memory_failure(behavior))
> > > -               return 0;
> > > +               return MADVISE_NO_LOCK;
> > >
> > > -       if (madvise_need_mmap_write(behavior)) {
> > > +       switch (behavior) {
> > > +       case MADV_REMOVE:
> > > +       case MADV_WILLNEED:
> > > +       case MADV_COLD:
> > > +       case MADV_PAGEOUT:
> > > +       case MADV_FREE:
> > > +       case MADV_POPULATE_READ:
> > > +       case MADV_POPULATE_WRITE:
> > > +       case MADV_COLLAPSE:
> > > +       case MADV_GUARD_INSTALL:
> > > +       case MADV_GUARD_REMOVE:
> > > +               return MADVISE_MMAP_READ_LOCK;
> > > +       case MADV_DONTNEED:
> > > +       case MADV_DONTNEED_LOCKED:
> > > +               return MADVISE_VMA_READ_LOCK;
> > > +       default:
> > > +               return MADVISE_MMAP_WRITE_LOCK;
> > > +       }
> > > +}
> > > +
> > > +static int madvise_lock(struct mm_struct *mm,
> > > +               struct madvise_behavior *madv_behavior)
> > > +{
> > > +       enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
> > > +
> > > +       switch (lock_mode) {
> > > +       case MADVISE_NO_LOCK:
> > > +               break;
> > > +       case MADVISE_MMAP_WRITE_LOCK:
> > >                 if (mmap_write_lock_killable(mm))
> > >                         return -EINTR;
> > > -       } else {
> > > +               break;
> > > +       case MADVISE_MMAP_READ_LOCK:
> > >                 mmap_read_lock(mm);
> > > +               break;
> > > +       case MADVISE_VMA_READ_LOCK:
> > > +               /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
> > > +               break;
> > >         }
> > > +
> > > +       madv_behavior->lock_mode = lock_mode;
> > >         return 0;
> > >  }
> > >
> > > -static void madvise_unlock(struct mm_struct *mm, int behavior)
> > > +static void madvise_unlock(struct mm_struct *mm,
> > > +               struct madvise_behavior *madv_behavior)
> > >  {
> > > -       if (is_memory_failure(behavior))
> > > +       switch (madv_behavior->lock_mode) {
> > > +       case  MADVISE_NO_LOCK:
> > >                 return;
> > > -
> > > -       if (madvise_need_mmap_write(behavior))
> > > +       case MADVISE_MMAP_WRITE_LOCK:
> > >                 mmap_write_unlock(mm);
> > > -       else
> > > +               break;
> > > +       case MADVISE_MMAP_READ_LOCK:
> > >                 mmap_read_unlock(mm);
> > > +               break;
> > > +       case MADVISE_VMA_READ_LOCK:
> > > +               /* We will drop the lock per-VMA in madvise_walk_vmas(). */
> > > +               break;
> > > +       }
> > > +
> > > +       madv_behavior->lock_mode = MADVISE_NO_LOCK;
> > >  }
> > >
> > >  static bool madvise_batch_tlb_flush(int behavior)
> > > @@ -1721,6 +1799,8 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > >
> > >         if (is_memory_failure(behavior))
> > >                 return madvise_inject_error(behavior, start, start + len_in);
> > > +
> > > +       // TODO: handle untagged stuff here...
> > >         start = untagged_addr(start); //untagged_addr_remote(mm, start);
> > >         end = start + PAGE_ALIGN(len_in);
> > >
> > > @@ -1729,7 +1809,7 @@ static int madvise_do_behavior(struct mm_struct *mm,
> > >                 error = madvise_populate(mm, start, end, behavior);
> > >         else
> > >                 error = madvise_walk_vmas(mm, start, end, madv_behavior,
> > > -                                         madvise_vma_behavior);
> > > +                                         madv_behavior, madvise_vma_behavior);
> > >         blk_finish_plug(&plug);
> > >         return error;
> > >  }
> > > @@ -1817,13 +1897,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
> > >
> > >         if (madvise_should_skip(start, len_in, behavior, &error))
> > >                 return error;
> > > -       error = madvise_lock(mm, behavior);
> > > +       error = madvise_lock(mm, &madv_behavior);
> > >         if (error)
> > >                 return error;
> > >         madvise_init_tlb(&madv_behavior, mm);
> > >         error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
> > >         madvise_finish_tlb(&madv_behavior);
> > > -       madvise_unlock(mm, behavior);
> > > +       madvise_unlock(mm, &madv_behavior);
> > >
> > >         return error;
> > >  }
> > > @@ -1847,7 +1927,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> > >
> > >         total_len = iov_iter_count(iter);
> > >
> > > -       ret = madvise_lock(mm, behavior);
> > > +       ret = madvise_lock(mm, &madv_behavior);
> > >         if (ret)
> > >                 return ret;
> > >         madvise_init_tlb(&madv_behavior, mm);
> > > @@ -1880,8 +1960,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> > >
> > >                         /* Drop and reacquire lock to unwind race. */
> > >                         madvise_finish_tlb(&madv_behavior);
> > > -                       madvise_unlock(mm, behavior);
> > > -                       ret = madvise_lock(mm, behavior);
> > > +                       madvise_unlock(mm, &madv_behavior);
> > > +                       ret = madvise_lock(mm, &madv_behavior);
> > >                         if (ret)
> > >                                 goto out;
> > >                         madvise_init_tlb(&madv_behavior, mm);
> > > @@ -1892,7 +1972,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
> > >                 iov_iter_advance(iter, iter_iov_len(iter));
> > >         }
> > >         madvise_finish_tlb(&madv_behavior);
> > > -       madvise_unlock(mm, behavior);
> > > +       madvise_unlock(mm, &madv_behavior);
> > >
> > >  out:
> > >         ret = (total_len - iov_iter_count(iter)) ? : ret;
> > > --
> > > 2.49.0
>
> Thanks
> Barry


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-06  3:55               ` Qi Zheng
@ 2025-06-06 10:44                 ` Lorenzo Stoakes
  2025-06-09  6:40                   ` Qi Zheng
  0 siblings, 1 reply; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-06 10:44 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Qi - thanks very much for your insightful response, will stash this
somewhere to revisit.

On Fri, Jun 06, 2025 at 11:55:56AM +0800, Qi Zheng wrote:
> Hi Lorenzo,
>
> On 6/5/25 10:04 PM, Lorenzo Stoakes wrote:
> > On Thu, Jun 05, 2025 at 11:23:18AM +0800, Qi Zheng wrote:
> > >
> > >
> > > On 6/5/25 1:50 AM, Lorenzo Stoakes wrote:
> > > > On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
> > > > > Hi Lorenzo,
> > > > >
> > > > > On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
> > > > > > On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
> > > > > > > Hi Jann,
> > > > > > >
> > > > > > > On 5/30/25 10:06 PM, Jann Horn wrote:
> > > > > > > > On Fri, May 30, 2025 at 12:44 PM Barry Song <21cnbao@gmail.com> wrote:
> > > > > > > > > Certain madvise operations, especially MADV_DONTNEED, occur far more
> > > > > > > > > frequently than other madvise options, particularly in native and Java
> > > > > > > > > heaps for dynamic memory management.
> > > > > > > > >
> > > > > > > > > Currently, the mmap_lock is always held during these operations, even when
> > > > > > > > > unnecessary. This causes lock contention and can lead to severe priority
> > > > > > > > > inversion, where low-priority threads—such as Android's HeapTaskDaemon—
> > > > > > > > > hold the lock and block higher-priority threads.
> > > > > > > > >
> > > > > > > > > This patch enables the use of per-VMA locks when the advised range lies
> > > > > > > > > entirely within a single VMA, avoiding the need for full VMA traversal. In
> > > > > > > > > practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
> > > > > > > > >
> > > > > > > > > Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
> > > > > > > > > benefits from this per-VMA lock optimization. After extended runtime,
> > > > > > > > > 217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
> > > > > > > > > only 1,231 fell back to mmap_lock.
> > > > > > > > >
> > > > > > > > > To simplify handling, the implementation falls back to the standard
> > > > > > > > > mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
> > > > > > > > > userfaultfd_remove().
> > > > > > > >
> > > > > > > > One important quirk of this is that it can, from what I can see, cause
> > > > > > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > > > > > lock at all:
> > > > > > > >
> > > > > > > > do_madvise [behavior=MADV_DONTNEED]
> > > > > > > >       madvise_lock
> > > > > > > >         lock_vma_under_rcu
> > > > > > > >       madvise_do_behavior
> > > > > > > >         madvise_single_locked_vma
> > > > > > > >           madvise_vma_behavior
> > > > > > > >             madvise_dontneed_free
> > > > > > > >               madvise_dontneed_single_vma
> > > > > > > >                 zap_page_range_single_batched [.reclaim_pt = true]
> > > > > > > >                   unmap_single_vma
> > > > > > > >                     unmap_page_range
> > > > > > > >                       zap_p4d_range
> > > > > > > >                         zap_pud_range
> > > > > > > >                           zap_pmd_range
> > > > > > > >                             zap_pte_range
> > > > > > > >                               try_get_and_clear_pmd
> > > > > > > >                               free_pte
> > > > > > > >
> > > > > > > > This clashes with the assumption in walk_page_range_novma() that
> > > > > > > > holding the mmap lock in write mode is sufficient to prevent
> > > > > > > > concurrent page table freeing, so it can probably lead to page table
> > > > > > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > > > > > >
> > > > > > > Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
> > > > > > > following case:
> > > > > > >
> > > > > > > cpu 0				cpu 1
> > > > > > >
> > > > > > > ptdump_walk_pgd
> > > > > > > --> walk_pte_range
> > > > > > >        --> pte_offset_map (hold RCU read lock)
> > > > > > > 				zap_pte_range
> > > > > > > 				--> free_pte (via RCU)
> > > > > > >            walk_pte_range_inner
> > > > > > >            --> ptdump_pte_entry (the PTE page is not freed at this time)
> > > > > > >
> > > > > > > IIUC, there is no UAF issue here?
> > > > > > >
> > > > > > > If I missed anything please let me know.
> > > >
> > > > Seems to me that we don't need the VMA locks then unless I'm missing
> > > > something? :) Jann?
> > > >
> > > > Would this RCU-lock-acquired-by-pte_offset_map also save us from the
> > > > munmap() downgraded read lock scenario also? Or is the problem there
> > > > intermediate page table teardown I guess?
> > > >
> > >
> > > Right. Currently, page table pages other than PTE pages are not
> > > protected by RCU, so mmap write lock still needed in the munmap path
> > > to wait for all readers of the page table pages to exit the critical
> > > section.
> > >
> > > In other words, once we have achieved that all page table pages are
> > > protected by RCU, we can completely remove the page table pages from
> > > the protection of mmap locks.
> >
> > Interesting - so on reclaim/migrate we are just clearing PTE entries with
> > the rmap lock right? Would this lead to a future where we could also tear
> > down page tables there?
> >
> > Another point to remember is that when we are clearing down higher level
> > page tables in the general case, the logic assumes nothing else can touch
> > anything... we hold both rmap lock AND mmap/vma locks at this point.
> >
> > But I guess if we're RCU-safe, we're safe even from rmap, right?
>
> Yeah, and we have already done something similar. For more details,
> please refer to retract_page_tables(). It only holds i_mmap_rwsem read
> lock and then calls pte_free_defer() to free the PTE page through RCU.

Yeah, but that i_mmap_rwsem is important :) as it protects against other
rmap users.

Interesting that we only do this for shmem case not anon...

>
> For the migrate case, the PTE entry will store a migration entry, right?
> And a new physical page will be installed soon through a page fault, so I
> don't think it is necessary to free the corresponding PTE page.

Yeah.

>
> For the reclaim case, there is a problem: only the PTE entries that map
> to a physical page are operated on each time. If we want to free the entire
> PTE page, we need to check the adjacent PTE entries. Maybe MGLRU can
> help with this; I remember that MGLRU has an optimization that checks
> the adjacent PTE entries.

Yeah indeed, we'd need to take the 'very simple' reclaim code and have it
do _even more_ in this case :P

>
> >
> > >
> > > Here are some of my previous thoughts:
> > >
> > > ```
> > > Another plan
> > > ============
> > >
> > > Currently, page table modifications are protected by page table locks
> > > (page_table_lock or split pmd/pte lock), but the life cycle of page
> > > table pages is protected by mmap_lock (and vma lock). For more details,
> > > please refer to the recently added Documentation/mm/process_addrs.rst file.
> > >
> > > Currently we try to free the PTE pages through RCU when
> > > CONFIG_PT_RECLAIM is turned on. In this case, we will no longer
> > > need to hold mmap_lock for the read/write op on the PTE pages.
> > >
> > > So maybe we can remove the page table from the protection of the mmap
> > > lock (which is too big), like this:
> > >
> > > 1. free all levels of page table pages by RCU, not just PTE pages, but
> > >     also pmd, pud, etc.
> > > 2. similar to pte_offset_map/pte_unmap, add
> > >     [pmd|pud]_offset_map/[pmd|pud]_unmap, and make them all contain
> > >     rcu_read_lock/rcu_read_unlock, and make them accept failure.
> > >
> > > In this way, we no longer need the mmap lock. For readers, such as page
> > > table walkers, we are already in the critical section of RCU. For
> > > writers, we only need to hold the page table lock.
> > >
> > > But there is a difficulty here, that is, the RCU critical section is not
> > > allowed to sleep, but it is possible to sleep in the callback function
> > > of .pmd_entry, such as mmu_notifier_invalidate_range_start().
> > >
> > > Use SRCU instead? Or use RCU + refcount method? Not sure. But I think
> > > it's an interesting thing to try.
> >
> > Thanks for the information, RCU freeing of page tables is something of a
>
> RCU-freeing is relatively simple: tlb_remove_table() can easily be
> changed to free all levels of page table pages through RCU. The more
> difficult part is to protect the page table pages above the PTE level
> with the RCU lock.
>
> > long-term TODO discussed back and forth :) might take a look myself if
> > somebody else hasn't grabbed when I have a second...
>
> This is awesome, I'm stuck with some other stuff at the moment, I'll
> also take a look at it later when I have time.

Yeah, I guess good to ping on-list if/when one of us/somebody else takes a
look to synchronise :)

>
> >
> > Is it _only_ the mmu notifier sleeping in this scenario? Or are there other
> > examples?
>
> I'm not sure, need some investigation.
>
> >
> > We could in theory always add another callback .pmd_entry_sleep or
> > something for this one case and document the requirement...
>
> Maybe, but the SRCU critical section cannot prevent the PTE page from
> being freed via RCU. :(

Idea is we'd fall back to non-RCU in this case and take locks... but then
ugh we'd race everything RCU and no it's all or nothing isn't it?

Overall - I will stash this response somewhere and come back to it if
somebody else doesn't in the meantime :)

>
> Thanks!
>
> >
> > > ```
> > >
> > > Thanks!
> > >
> > >
> >
> > Cheers, Lorenzo
>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-04 17:50         ` Lorenzo Stoakes
  2025-06-05  3:23           ` Qi Zheng
@ 2025-06-06 11:07           ` Jann Horn
  1 sibling, 0 replies; 35+ messages in thread
From: Jann Horn @ 2025-06-06 11:07 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Qi Zheng, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Wed, Jun 4, 2025 at 7:50 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> On Wed, Jun 04, 2025 at 02:02:12PM +0800, Qi Zheng wrote:
> > Hi Lorenzo,
> >
> > On 6/3/25 5:54 PM, Lorenzo Stoakes wrote:
> > > On Tue, Jun 03, 2025 at 03:24:28PM +0800, Qi Zheng wrote:
> > > > Hi Jann,
> > > >
> > > > On 5/30/25 10:06 PM, Jann Horn wrote:
> > > > > One important quirk of this is that it can, from what I can see, cause
> > > > > freeing of page tables (through pt_reclaim) without holding the mmap
> > > > > lock at all:
> > > > >
> > > > > do_madvise [behavior=MADV_DONTNEED]
> > > > >     madvise_lock
> > > > >       lock_vma_under_rcu
> > > > >     madvise_do_behavior
> > > > >       madvise_single_locked_vma
> > > > >         madvise_vma_behavior
> > > > >           madvise_dontneed_free
> > > > >             madvise_dontneed_single_vma
> > > > >               zap_page_range_single_batched [.reclaim_pt = true]
> > > > >                 unmap_single_vma
> > > > >                   unmap_page_range
> > > > >                     zap_p4d_range
> > > > >                       zap_pud_range
> > > > >                         zap_pmd_range
> > > > >                           zap_pte_range
> > > > >                             try_get_and_clear_pmd
> > > > >                             free_pte
> > > > >
> > > > > This clashes with the assumption in walk_page_range_novma() that
> > > > > holding the mmap lock in write mode is sufficient to prevent
> > > > > concurrent page table freeing, so it can probably lead to page table
> > > > > UAF through the ptdump interface (see ptdump_walk_pgd()).
> > > >
> > > > Maybe not? The PTE page is freed via RCU in zap_pte_range(), so in the
> > > > following case:
> > > >
> > > > cpu 0                             cpu 1
> > > >
> > > > ptdump_walk_pgd
> > > > --> walk_pte_range
> > > >      --> pte_offset_map (hold RCU read lock)
> > > >                           zap_pte_range
> > > >                           --> free_pte (via RCU)
> > > >          walk_pte_range_inner
> > > >          --> ptdump_pte_entry (the PTE page is not freed at this time)
> > > >
> > > > IIUC, there is no UAF issue here?
> > > >
> > > > If I missed anything please let me know.
>
> Seems to me that we don't need the VMA locks then unless I'm missing
> something? :) Jann?

Aah, right, this is one of those paths that use RCU to protect
read-only PTE-level page table access that can tolerate seeing stale
data. Sorry about the confusion.
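
For anyone following along, the reader-side pattern in question looks roughly
like the sketch below (pmd and addr come from the enclosing walk; error paths
and the actual entry handling are elided):

```c
/*
 * pte_offset_map() enters an RCU read-side critical section, so a PTE
 * table concurrently freed via RCU (e.g. by zap_pte_range() with
 * pt_reclaim) cannot disappear until pte_unmap(). The values read may
 * be stale, which these read-only paths tolerate.
 */
pte_t *pte = pte_offset_map(pmd, addr);

if (pte) {
	pte_t entry = ptep_get(pte);	/* the table itself is pinned here */

	/* ... inspect 'entry', as e.g. ptdump_pte_entry() does ... */
	pte_unmap(pte);			/* leaves the RCU read-side section */
}
```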

> Would this RCU-lock-acquired-by-pte_offset_map also save us from the
> munmap() downgraded read lock scenario also? Or is the problem there
> intermediate page table teardown I guess?

(I see Qi Zheng already clarified this part.)


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-06 10:44                 ` Lorenzo Stoakes
@ 2025-06-09  6:40                   ` Qi Zheng
  2025-06-09 15:08                     ` Lorenzo Stoakes
  2025-06-10  7:20                     ` David Hildenbrand
  0 siblings, 2 replies; 35+ messages in thread
From: Qi Zheng @ 2025-06-09  6:40 UTC (permalink / raw)
  To: Lorenzo Stoakes
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

Hi Lorenzo,

On 6/6/25 6:44 PM, Lorenzo Stoakes wrote:

[snip]

>>
>>>
>>> We could in theory always add another callback .pmd_entry_sleep or
>>> something for this one case and document the requirement...
>>
>> Maybe, but the SRCU critical section cannot prevent the PTE page from
>> being freed via RCU. :(
> 
> Idea is we'd fall back to non-RCU in this case and take locks... but then
> ugh we'd race everything RCU and no it's all or nothing isn't it?

So maybe the RCU+refcount method is feasible. We can release the RCU
lock after incrementing the reference count, which can ensure that the
page table page is not freed.
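
As a rough illustration of what I mean (pmd_table_tryget()/pmd_table_put()
are made-up names standing in for "take/drop a reference on the page table
page"; nothing like them exists today):

```c
/*
 * RCU + refcount sketch: the RCU read-side section only needs to live
 * long enough to pin the page table page; after that the walker is free
 * to sleep, and the page cannot be freed until the reference is dropped.
 */
static int walk_pmd_pinned_sketch(pud_t *pud, unsigned long addr)
{
	pmd_t *pmdp;

	rcu_read_lock();
	pmdp = pmd_offset(pud, addr);
	if (!pmd_table_tryget(pmdp)) {		/* hypothetical helper */
		rcu_read_unlock();
		return -EAGAIN;			/* caller retries or falls back */
	}
	rcu_read_unlock();

	/* Sleeping is fine here, e.g. mmu_notifier_invalidate_range_start(). */

	pmd_table_put(pmdp);			/* hypothetical helper */
	return 0;
}
```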

> 
> Overall - I will stash this response somewhere and come back to it if
> somebody else doesn't in the meantime :)

Thanks!

> 
>>




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-09  6:40                   ` Qi Zheng
@ 2025-06-09 15:08                     ` Lorenzo Stoakes
  2025-06-10  7:20                     ` David Hildenbrand
  1 sibling, 0 replies; 35+ messages in thread
From: Lorenzo Stoakes @ 2025-06-09 15:08 UTC (permalink / raw)
  To: Qi Zheng
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, David Hildenbrand, Vlastimil Babka,
	Suren Baghdasaryan, Lokesh Gidra, Tangquan Zheng

On Mon, Jun 09, 2025 at 02:40:09PM +0800, Qi Zheng wrote:
> Hi Lorenzo,
>
> On 6/6/25 6:44 PM, Lorenzo Stoakes wrote:
>
> [snip]
>
> > >
> > > >
> > > > We could in theory always add another callback .pmd_entry_sleep or
> > > > something for this one case and document the requirement...
> > >
> > > Maybe, but the SRCU critical section cannot prevent the PTE page from
> > > being freed via RCU. :(
> >
> > Idea is we'd fall back to non-RCU in this case and take locks... but then
> > ugh we'd race everything RCU and no it's all or nothing isn't it?
>
> So maybe the RCU+refcount method is feasible. We can release the RCU
> lock after incrementing the reference count, which can ensure that the
> page table page is not freed.

Oh interesting... I'll note it down for when I take a look, should you not
get to it first :>)

>
> >
> > Overall - I will stash this response somewhere and come back to it if
> > somebody else doesn't in the meantime :)
>
> Thanks!
>
> >
> > >
>
>

Cheers, Lorenzo



* Re: [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
  2025-06-09  6:40                   ` Qi Zheng
  2025-06-09 15:08                     ` Lorenzo Stoakes
@ 2025-06-10  7:20                     ` David Hildenbrand
  1 sibling, 0 replies; 35+ messages in thread
From: David Hildenbrand @ 2025-06-10  7:20 UTC (permalink / raw)
  To: Qi Zheng, Lorenzo Stoakes
  Cc: Jann Horn, Barry Song, akpm, linux-mm, linux-kernel, Barry Song,
	Liam R. Howlett, Vlastimil Babka, Suren Baghdasaryan,
	Lokesh Gidra, Tangquan Zheng

On 09.06.25 08:40, Qi Zheng wrote:
> Hi Lorenzo,
> 
> On 6/6/25 6:44 PM, Lorenzo Stoakes wrote:
> 
> [snip]
> 
>>>
>>>>
>>>> We could in theory always add another callback .pmd_entry_sleep or
>>>> something for this one case and document the requirement...
>>>
>>> Maybe, but the SRCU critical section cannot prevent the PTE page from
>>> being freed via RCU. :(
>>
>> Idea is we'd fall back to non-RCU in this case and take locks... but then
>> ugh we'd race everything RCU and no it's all or nothing isn't it?
> 
> So maybe the RCU+refcount method is feasible. We can release the RCU
> lock after incrementing the reference count, which can ensure that the
> page table page is not freed.

I'll note that maybe after the memdesc rework, page tables will no longer
have a refcount. Maybe.

I mean, it kind of makes sense, because nobody should really be taking
references on page tables.

So finding something that doesn't depend on page-table refcounts might 
be better.

-- 
Cheers,

David / dhildenb




end of thread, other threads:[~2025-06-10  7:20 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-05-30 10:44 [PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED Barry Song
2025-05-30 14:06 ` Jann Horn
2025-05-30 14:34   ` Lorenzo Stoakes
2025-05-30 20:17     ` Barry Song
2025-06-02 17:35       ` SeongJae Park
2025-06-02 17:53         ` SeongJae Park
2025-05-30 20:40     ` Jann Horn
2025-06-02 11:50       ` Lorenzo Stoakes
2025-06-03  1:06         ` Barry Song
2025-06-03  9:48           ` Lorenzo Stoakes
2025-06-03  7:06       ` Barry Song
2025-06-03 16:52         ` Jann Horn
2025-06-05 10:27           ` Barry Song
2025-05-30 22:00   ` Barry Song
2025-06-02 14:55     ` Jann Horn
2025-06-03  7:51       ` Barry Song
2025-06-03  7:24   ` Qi Zheng
2025-06-03  9:54     ` Lorenzo Stoakes
2025-06-04  6:02       ` Qi Zheng
2025-06-04 17:50         ` Lorenzo Stoakes
2025-06-05  3:23           ` Qi Zheng
2025-06-05 14:04             ` Lorenzo Stoakes
2025-06-06  3:55               ` Qi Zheng
2025-06-06 10:44                 ` Lorenzo Stoakes
2025-06-09  6:40                   ` Qi Zheng
2025-06-09 15:08                     ` Lorenzo Stoakes
2025-06-10  7:20                     ` David Hildenbrand
2025-06-06 11:07           ` Jann Horn
2025-06-03 18:43 ` Lorenzo Stoakes
2025-06-03 20:17   ` Suren Baghdasaryan
2025-06-04  5:22     ` Lorenzo Stoakes
2025-06-06  7:18     ` Barry Song
2025-06-06 10:16       ` Lorenzo Stoakes
2025-06-03 20:59   ` Pedro Falcato
2025-06-04  5:23     ` Lorenzo Stoakes
