[PATCH RFC v2] mm: use per_vma lock for MADV_DONTNEED
From: Barry Song @ 2025-05-30 10:44 UTC
  To: akpm, linux-mm
  Cc: linux-kernel, Barry Song, Liam R. Howlett, Lorenzo Stoakes,
	David Hildenbrand, Vlastimil Babka, Jann Horn, Suren Baghdasaryan,
	Lokesh Gidra, Tangquan Zheng

From: Barry Song <v-songbaohua@oppo.com>

Certain madvise operations, especially MADV_DONTNEED, occur far more
frequently than others, particularly in native and Java heaps, where
they are used for dynamic memory management.
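
As an illustration only (not part of this patch), a userspace heap
typically returns freed, page-aligned memory to the kernel like this;
heap_release() is a hypothetical helper name:

	#include <sys/mman.h>

	/* Drop the contents of a freed chunk while keeping it mapped;
	 * the pages refault as zero-fill (or from the backing file) on
	 * the next access. */
	static void heap_release(void *chunk, size_t len)
	{
		madvise(chunk, len, MADV_DONTNEED);
	}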

Currently, the mmap_lock is always held during these operations, even when
unnecessary. This causes lock contention and can lead to severe priority
inversion, where low-priority threads—such as Android's HeapTaskDaemon—
hold the lock and block higher-priority threads.

This patch enables the use of per-VMA locks when the advised range lies
entirely within a single VMA, avoiding the need for full VMA traversal. In
practice, userspace heaps rarely issue MADV_DONTNEED across multiple VMAs.
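
Condensed, the lock selection added in madvise_lock() below (a
restatement of the diff, not extra code) is: take the per-VMA lock
only when the whole range fits in one VMA and userfaultfd is not
armed on it, otherwise fall back to the mmap_lock:

	vma = lock_vma_under_rcu(mm, start);
	if (vma && end <= vma->vm_end && !userfaultfd_armed(vma)) {
		madv_behavior->vma = vma;	/* per-VMA lock path */
		return 0;
	}
	if (vma)
		vma_end_read(vma);
	return __madvise_lock(mm, behavior);	/* mmap_lock path */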

Tangquan’s testing shows that over 99.5% of memory reclaimed by Android
benefits from this per-VMA lock optimization. After extended runtime,
217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
only 1,231 fell back to mmap_lock.

To simplify handling, the implementation falls back to the standard
mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
userfaultfd_remove().

Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 -v2:
 * hide the per-VMA lock inside madvise_lock(), per Lorenzo;
 * ideally, vector_madvise() would decide the lock type for each
   iteration; for now, it still takes the global lock.
 -v1:
 https://lore.kernel.org/linux-mm/20250527044145.13153-1-21cnbao@gmail.com/

 mm/madvise.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 9 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 8433ac9b27e0..d408ffa404b3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -51,6 +51,7 @@ struct madvise_walk_private {
 struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
+	struct vm_area_struct *vma;
 };
 
 /*
@@ -1553,6 +1554,21 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	return unmapped_error;
 }
 
+/*
+ * Call the visit function on the single vma with the per_vma lock
+ */
+static inline
+int madvise_single_locked_vma(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end, void *arg,
+		int (*visit)(struct vm_area_struct *vma,
+				   struct vm_area_struct **prev, unsigned long start,
+				   unsigned long end, void *arg))
+{
+	struct vm_area_struct *prev;
+
+	return visit(vma, &prev, start, end, arg);
+}
+
 #ifdef CONFIG_ANON_VMA_NAME
 static int madvise_vma_anon_name(struct vm_area_struct *vma,
 				 struct vm_area_struct **prev,
@@ -1603,7 +1619,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
-static int madvise_lock(struct mm_struct *mm, int behavior)
+static int __madvise_lock(struct mm_struct *mm, int behavior)
 {
 	if (is_memory_failure(behavior))
 		return 0;
@@ -1617,7 +1633,7 @@ static int madvise_lock(struct mm_struct *mm, int behavior)
 	return 0;
 }
 
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void __madvise_unlock(struct mm_struct *mm, int behavior)
 {
 	if (is_memory_failure(behavior))
 		return;
@@ -1628,6 +1644,46 @@ static void madvise_unlock(struct mm_struct *mm, int behavior)
 		mmap_read_unlock(mm);
 }
 
+static int madvise_lock(struct mm_struct *mm, unsigned long start,
+		unsigned long len, struct madvise_behavior *madv_behavior)
+{
+	int behavior = madv_behavior->behavior;
+
+	/*
+	 * MADV_DONTNEED is commonly used with userspace heaps and most often
+	 * affects a single VMA. In these cases, we can use per-VMA locks to
+	 * reduce contention on the mmap_lock.
+	 */
+	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) {
+		struct vm_area_struct *vma;
+		unsigned long end;
+
+		start = untagged_addr(start);
+		end = start + len;
+		vma = lock_vma_under_rcu(mm, start);
+		if (!vma)
+			goto out;
+		if (end > vma->vm_end || userfaultfd_armed(vma)) {
+			vma_end_read(vma);
+			goto out;
+		}
+		madv_behavior->vma = vma;
+		return 0;
+	}
+
+out:
+	return __madvise_lock(mm, behavior);
+}
+
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	if (madv_behavior->vma)
+		vma_end_read(madv_behavior->vma);
+	else
+		__madvise_unlock(mm, madv_behavior->behavior);
+}
+
 static bool madvise_batch_tlb_flush(int behavior)
 {
 	switch (behavior) {
@@ -1714,19 +1770,24 @@ static int madvise_do_behavior(struct mm_struct *mm,
 		unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
 {
+	struct vm_area_struct *vma = madv_behavior->vma;
 	int behavior = madv_behavior->behavior;
+
 	struct blk_plug plug;
 	unsigned long end;
 	int error;
 
 	if (is_memory_failure(behavior))
 		return madvise_inject_error(behavior, start, start + len_in);
-	start = untagged_addr_remote(mm, start);
+	start = untagged_addr(start);
 	end = start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
 	if (is_madvise_populate(behavior))
 		error = madvise_populate(mm, start, end, behavior);
+	else if (vma)
+		error = madvise_single_locked_vma(vma, start, end,
+				madv_behavior, madvise_vma_behavior);
 	else
 		error = madvise_walk_vmas(mm, start, end, madv_behavior,
 					  madvise_vma_behavior);
@@ -1817,13 +1878,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 
 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, behavior);
+	error = madvise_lock(mm, start, len_in, &madv_behavior);
 	if (error)
 		return error;
 	madvise_init_tlb(&madv_behavior, mm);
 	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 	return error;
 }
@@ -1847,7 +1908,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 	total_len = iov_iter_count(iter);
 
-	ret = madvise_lock(mm, behavior);
+	ret = __madvise_lock(mm, behavior);
 	if (ret)
 		return ret;
 	madvise_init_tlb(&madv_behavior, mm);
@@ -1880,8 +1941,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, behavior);
-			madvise_lock(mm, behavior);
+			__madvise_unlock(mm, behavior);
+			__madvise_lock(mm, behavior);
 			madvise_init_tlb(&madv_behavior, mm);
 			continue;
 		}
@@ -1890,7 +1951,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	__madvise_unlock(mm, behavior);
 
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
 
-- 
2.39.3 (Apple Git-146)

