Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups
@ 2026-07-01 14:40 Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 1/5] mm: mincore: attempt per-vma lock during page table walk Kefeng Wang
                   ` (5 more replies)
  0 siblings, 6 replies; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

Kefeng Wang (5):
  mm: mincore: attempt per-vma lock during page table walk
  mm: mincore: remove special handling for VM_PFNMAP
  mm: mincore: replace __get_free_page() with kmalloc()
  mm: mincore: remove xa_is_value() in mincore_swap()
  mm: mincore: improve mincore_hugetlb()

 mm/mincore.c | 96 +++++++++++++++++++++++++---------------------------
 1 file changed, 47 insertions(+), 49 deletions(-)

-- 
2.27.0



^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH mm-new 1/5] mm: mincore: attempt per-vma lock during page table walk
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
@ 2026-07-01 14:40 ` Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 2/5] mm: mincore: remove special handing for VM_PFNMAP Kefeng Wang
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

do_mincore() is a read-only query of page residency within a single
VMA, making it an ideal candidate for per-VMA locking. Try the per-VMA
lock first and fall back to mmap_lock, reducing global contention with
page faults and other mmap_lock holders in multi-threaded applications.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/mincore.c | 53 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 866b26cec540..f561d6e01052 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -12,6 +12,7 @@
 #include <linux/gfp.h>
 #include <linux/pagewalk.h>
 #include <linux/mman.h>
+#include <linux/mmap_lock.h>
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/leafops.h>
@@ -232,27 +233,24 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
 	       file_permission(vma->vm_file, MAY_WRITE) == 0;
 }
 
-static const struct mm_walk_ops mincore_walk_ops = {
-	.pmd_entry		= mincore_pte_range,
-	.pte_hole		= mincore_unmapped_range,
-	.hugetlb_entry		= mincore_hugetlb,
-	.walk_lock		= PGWALK_RDLOCK,
-};
-
 /*
  * Do a chunk of "sys_mincore()". We've already checked
- * all the arguments, we hold the mmap semaphore: we should
- * just return the amount of info we're asked for.
+ * all the arguments, we should just return the amount of
+ * info we're asked for.
  */
-static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
+static long do_mincore(struct vm_area_struct *vma, unsigned long addr,
+		unsigned long pages, unsigned char *vec,
+		enum page_walk_lock walk_lock)
 {
-	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
+	struct mm_walk_ops mincore_walk_ops = {
+		.pmd_entry		= mincore_pte_range,
+		.pte_hole		= mincore_unmapped_range,
+		.hugetlb_entry		= mincore_hugetlb,
+		.walk_lock		= walk_lock,
+	};
 
-	vma = vma_lookup(current->mm, addr);
-	if (!vma)
-		return -ENOMEM;
 	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
 	if (!can_do_mincore(vma)) {
 		unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
@@ -328,13 +326,34 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 
 	retval = 0;
 	while (pages) {
+		struct mm_struct *mm = current->mm;
+		enum page_walk_lock walk_lock;
+		struct vm_area_struct *vma;
+
+		vma = lock_vma_under_rcu(mm, start);
+		if (vma) {
+			walk_lock = PGWALK_VMA_RDLOCK_VERIFY;
+		} else {
+			mmap_read_lock(mm);
+			vma = vma_lookup(mm, start);
+			if (!vma) {
+				mmap_read_unlock(mm);
+				retval = -ENOMEM;
+				break;
+			}
+			walk_lock = PGWALK_RDLOCK;
+		}
+
 		/*
 		 * Do at most PAGE_SIZE entries per iteration, due to
 		 * the temporary buffer size.
 		 */
-		mmap_read_lock(current->mm);
-		retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
-		mmap_read_unlock(current->mm);
+		retval = do_mincore(vma, start, min(pages, PAGE_SIZE), tmp, walk_lock);
+
+		if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
+			vma_end_read(vma);
+		else
+			mmap_read_unlock(mm);
 
 		if (retval <= 0)
 			break;
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH mm-new 2/5] mm: mincore: remove special handling for VM_PFNMAP
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 1/5] mm: mincore: attempt per-vma lock during page table walk Kefeng Wang
@ 2026-07-01 14:40 ` Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc() Kefeng Wang
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

As David pointed out, "it's hard to believe that someone depends
on pages in VM_PFNMAP to *not* be present", so remove the historical
behavior that always reports VM_PFNMAP pages as non-resident,
simplifying the code.

Link: https://lore.kernel.org/linux-mm/0e619d71-1c3d-4534-8376-2982c7348c31@kernel.org/
Suggested-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/mincore.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index f561d6e01052..6374819dd5c1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -258,14 +258,6 @@ static long do_mincore(struct vm_area_struct *vma, unsigned long addr,
 		return pages;
 	}
 
-	/*
-	 * mincore historically reports PFNMAP mappings as non-resident.
-	 */
-	if (vma->vm_flags & VM_PFNMAP) {
-		__mincore_unmapped_range(addr, end, vma, vec);
-		return (end - addr) >> PAGE_SHIFT;
-	}
-
 	err = walk_page_range_vma(vma, addr, end, &mincore_walk_ops, vec);
 	if (err < 0)
 		return err;
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc()
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 1/5] mm: mincore: attempt per-vma lock during page table walk Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 2/5] mm: mincore: remove special handing for VM_PFNMAP Kefeng Wang
@ 2026-07-01 14:40 ` Kefeng Wang
  2026-07-01 18:26   ` Andrew Morton
  2026-07-01 14:40 ` [PATCH mm-new 4/5] mm: mincore: remove xa_is_value() in mincore_swap() Kefeng Wang
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

Remove ugly casts by using the more natural kmalloc/kfree allocation.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/mincore.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 6374819dd5c1..d561f1e5c461 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -13,6 +13,7 @@
 #include <linux/pagewalk.h>
 #include <linux/mman.h>
 #include <linux/mmap_lock.h>
+#include <linux/slab.h>
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/leafops.h>
@@ -312,7 +313,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	if (!access_ok(vec, pages))
 		return -EFAULT;
 
-	tmp = (void *) __get_free_page(GFP_USER);
+	tmp = kmalloc(PAGE_SIZE, GFP_USER);
 	if (!tmp)
 		return -EAGAIN;
 
@@ -358,6 +359,6 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 		start += retval << PAGE_SHIFT;
 		retval = 0;
 	}
-	free_page((unsigned long) tmp);
+	kfree(tmp);
 	return retval;
 }
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH mm-new 4/5] mm: mincore: remove xa_is_value() in mincore_swap()
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
                   ` (2 preceding siblings ...)
  2026-07-01 14:40 ` [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc() Kefeng Wang
@ 2026-07-01 14:40 ` Kefeng Wang
  2026-07-01 14:40 ` [PATCH mm-new 5/5] mm: mincore: improve mincore_hugetlb() Kefeng Wang
  2026-07-01 18:24 ` [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Andrew Morton
  5 siblings, 0 replies; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

swap_cache_get_folio() no longer returns shadow entries, so the
xa_is_value() check is unnecessary.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/mincore.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index d561f1e5c461..1c72cc3ac87b 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -92,8 +92,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
 	folio = swap_cache_get_folio(entry);
 	if (shmem)
 		put_swap_device(si);
-	/* The swap cache space contains either folio, shadow or NULL */
-	if (folio && !xa_is_value(folio)) {
+	if (folio) {
 		present = folio_test_uptodate(folio);
 		folio_put(folio);
 	}
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH mm-new 5/5] mm: mincore: improve mincore_hugetlb()
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
                   ` (3 preceding siblings ...)
  2026-07-01 14:40 ` [PATCH mm-new 4/5] mm: mincore: remove xa_is_value() in mincore_swap() Kefeng Wang
@ 2026-07-01 14:40 ` Kefeng Wang
  2026-07-01 18:24 ` [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Andrew Morton
  5 siblings, 0 replies; 8+ messages in thread
From: Kefeng Wang @ 2026-07-01 14:40 UTC (permalink / raw)
  To: Andrew Morton, linux-mm
  Cc: Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka, Jann Horn,
	Pedro Falcato, David Hildenbrand, Zi Yan, Kefeng Wang

walk_hugetlb_range() always passes a non-NULL pte, so remove the
dead NULL check. Replace the per-page iteration loop with memset()
for better performance.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 mm/mincore.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 1c72cc3ac87b..1a2568734e34 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -29,30 +29,17 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
 			unsigned long end, struct mm_walk *walk)
 {
 #ifdef CONFIG_HUGETLB_PAGE
-	unsigned char present;
+	unsigned long nr = (end - addr) >> PAGE_SHIFT;
 	unsigned char *vec = walk->private;
+	unsigned char present;
 	spinlock_t *ptl;
+	pte_t ptep;
 
 	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
-
-	/*
-	 * Hugepages under user process are always in RAM and never
-	 * swapped out, but theoretically it needs to be checked.
-	 */
-	if (!pte) {
-		present = 0;
-	} else {
-		const pte_t ptep = huge_ptep_get(walk->mm, addr, pte);
-
-		if (huge_pte_none(ptep) || pte_is_marker(ptep))
-			present = 0;
-		else
-			present = 1;
-	}
-
-	for (; addr != end; vec++, addr += PAGE_SIZE)
-		*vec = present;
-	walk->private = vec;
+	ptep = huge_ptep_get(walk->mm, addr, pte);
+	present = !(huge_pte_none(ptep) || pte_is_marker(ptep));
+	memset(vec, present, nr);
+	walk->private += nr;
 	spin_unlock(ptl);
 #else
 	BUG();
-- 
2.27.0



^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups
  2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
                   ` (4 preceding siblings ...)
  2026-07-01 14:40 ` [PATCH mm-new 5/5] mm: mincore: improve mincore_hugetlb() Kefeng Wang
@ 2026-07-01 18:24 ` Andrew Morton
  5 siblings, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2026-07-01 18:24 UTC (permalink / raw)
  To: Kefeng Wang
  Cc: linux-mm, Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka,
	Jann Horn, Pedro Falcato, David Hildenbrand, Zi Yan

On Wed, 1 Jul 2026 22:40:42 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:

> Kefeng Wang (5):
>   mm: mincore: attempt per-vma lock during page table walk
>   mm: mincore: remove special handing for VM_PFNMAP
>   mm: mincore: replace __get_free_page() with kmalloc()
>   mm: mincore: remove xa_is_value() in mincore_swap()
>   mm: mincore: improve mincore_hugetlb()
> 
>  mm/mincore.c | 96 +++++++++++++++++++++++++---------------------------
>  1 file changed, 47 insertions(+), 49 deletions(-)

That was brief ;)

Thanks, I'll await reviewer input on this series.  AI review said a
couple of things:
	https://sashiko.dev/#/patchset/20260701144047.3786939-1-wangkefeng.wang@huawei.com


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc()
  2026-07-01 14:40 ` [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc() Kefeng Wang
@ 2026-07-01 18:26   ` Andrew Morton
  0 siblings, 0 replies; 8+ messages in thread
From: Andrew Morton @ 2026-07-01 18:26 UTC (permalink / raw)
  To: Kefeng Wang
  Cc: linux-mm, Liam R. Howlett, Lorenzo Stoakes, Vlastimil Babka,
	Jann Horn, Pedro Falcato, David Hildenbrand, Zi Yan

On Wed, 1 Jul 2026 22:40:45 +0800 Kefeng Wang <wangkefeng.wang@huawei.com> wrote:

> Remove ugly casts by using the more natural kmalloc/kfree allocation.
> 
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -13,6 +13,7 @@
>  #include <linux/pagewalk.h>
>  #include <linux/mman.h>
>  #include <linux/mmap_lock.h>
> +#include <linux/slab.h>
>  #include <linux/syscalls.h>
>  #include <linux/swap.h>
>  #include <linux/leafops.h>
> @@ -312,7 +313,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
>  	if (!access_ok(vec, pages))
>  		return -EFAULT;
>  
> -	tmp = (void *) __get_free_page(GFP_USER);
> +	tmp = kmalloc(PAGE_SIZE, GFP_USER);

kmalloc(GFP_USER) is a weird combination.  I don't think GFP_USER ever
made sense in here.

>  	if (!tmp)
>  		return -EAGAIN;
>  
> @@ -358,6 +359,6 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
>  		start += retval << PAGE_SHIFT;
>  		retval = 0;
>  	}
> -	free_page((unsigned long) tmp);
> +	kfree(tmp);
>  	return retval;
>  }



^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-07-01 18:26 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-07-01 14:40 [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Kefeng Wang
2026-07-01 14:40 ` [PATCH mm-new 1/5] mm: mincore: attempt per-vma lock during page table walk Kefeng Wang
2026-07-01 14:40 ` [PATCH mm-new 2/5] mm: mincore: remove special handing for VM_PFNMAP Kefeng Wang
2026-07-01 14:40 ` [PATCH mm-new 3/5] mm: mincore: replace __get_free_page() with kmalloc() Kefeng Wang
2026-07-01 18:26   ` Andrew Morton
2026-07-01 14:40 ` [PATCH mm-new 4/5] mm: mincore: remove xa_is_value() in mincore_swap() Kefeng Wang
2026-07-01 14:40 ` [PATCH mm-new 5/5] mm: mincore: improve mincore_hugetlb() Kefeng Wang
2026-07-01 18:24 ` [PATCH mm-new 0/5] mm: mincore: reduce lock contention and misc cleanups Andrew Morton

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.