* [PATCH] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE
From: Lokesh Gidra @ 2025-07-31 10:47 UTC
To: akpm
Cc: aarcange, linux-mm, linux-kernel, 21cnbao, ngeoffray,
Lokesh Gidra, Suren Baghdasaryan, Kalesh Singh, Barry Song,
David Hildenbrand, Peter Xu
The MOVE ioctl's runtime is dominated by the TLB-flush cost incurred when
moving present pages. Mitigate this cost by opportunistically batching
contiguous present pages into a single TLB flush.

Without batching, in our testing on an arm64 Android device with UFFD GC,
which uses the MOVE ioctl for compaction, we observed that of the total
time spent in move_pages_pte(), over 40% goes to ptep_clear_flush() and
~20% to vm_normal_folio().

With batching, vm_normal_folio()'s share rises to over 70% of
move_pages_pte(), even though vm_normal_folio() itself is unchanged.
Furthermore, the time spent within move_pages_pte() is only ~20%, and
that includes the TLB-flush overhead.

Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
---
mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++--------------
1 file changed, 127 insertions(+), 52 deletions(-)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8253978ee0fb..2465fb234671 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1026,18 +1026,62 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
}
-static int move_present_pte(struct mm_struct *mm,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- pte_t *dst_pte, pte_t *src_pte,
- pte_t orig_dst_pte, pte_t orig_src_pte,
- pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio *src_folio)
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for batched
+ * move. If so, then returns pointer to the folio, after locking it. Otherwise,
+ * returns NULL.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (pte_none(orig_src_pte))
+ return NULL;
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio, unsigned long len)
{
int err = 0;
+ unsigned long src_start = src_addr;
+ unsigned long addr_end;
+
+ if (len > PAGE_SIZE) {
+ addr_end = (dst_addr + PMD_SIZE) & PMD_MASK;
+ if (dst_addr + len > addr_end)
+ len = addr_end - dst_addr;
+ addr_end = (src_addr + PMD_SIZE) & PMD_MASK;
+ if (src_addr + len > addr_end)
+ len = addr_end - src_addr;
+ }
+ flush_cache_range(src_vma, src_addr, src_addr + len);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1051,31 +1095,60 @@ static int move_present_pte(struct mm_struct *mm,
err = -EBUSY;
goto out;
}
+ /* Avoid batching overhead for single page case */
+ if (len > PAGE_SIZE) {
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ } else
+ orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
+
+ addr_end = src_start + len;
+ do {
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
- orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
- /* Folio got pinned from under us. Put it back and fail the move. */
- if (folio_maybe_dma_pinned(src_folio)) {
- set_pte_at(mm, src_addr, src_pte, orig_src_pte);
- err = -EBUSY;
- goto out;
- }
-
- folio_move_anon_rmap(src_folio, dst_vma);
- src_folio->index = linear_page_index(dst_vma, dst_addr);
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
- orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
- /* Set soft dirty bit so userspace can notice the pte was moved */
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
- if (pte_dirty(orig_src_pte))
- orig_dst_pte = pte_mkdirty(orig_dst_pte);
- orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == addr_end)
+ break;
+ src_pte++;
+ dst_pte++;
- set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte);
+ if (!src_folio)
+ break;
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ dst_addr += PAGE_SIZE;
+ } while (true);
+
+ if (len > PAGE_SIZE) {
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+ }
out:
double_pt_unlock(dst_ptl, src_ptl);
- return err;
+ if (src_folio)
+ folio_unlock(src_folio);
+ return src_addr > src_start ? src_addr - src_start : err;
}
static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
@@ -1140,7 +1213,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
static int move_zeropage_pte(struct mm_struct *mm,
@@ -1154,6 +1227,7 @@ static int move_zeropage_pte(struct mm_struct *mm,
{
pte_t zero_pte;
+ flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
dst_pmd, dst_pmdval)) {
@@ -1167,20 +1241,19 @@ static int move_zeropage_pte(struct mm_struct *mm,
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
double_pt_unlock(dst_ptl, src_ptl);
- return 0;
+ return PAGE_SIZE;
}
/*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return number of bytes moved.
*/
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
- struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma,
- unsigned long dst_addr, unsigned long src_addr,
- __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
{
swp_entry_t entry;
struct swap_info_struct *si = NULL;
@@ -1196,9 +1269,8 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
struct mmu_notifier_range range;
int err = 0;
- flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
- src_addr, src_addr + PAGE_SIZE);
+ src_addr, src_addr + len);
mmu_notifier_invalidate_range_start(&range);
retry:
/*
@@ -1257,7 +1329,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
err = -ENOENT;
else /* nothing to do to move a hole */
- err = 0;
+ err = PAGE_SIZE;
goto out;
}
@@ -1375,10 +1447,13 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
}
}
- err = move_present_pte(mm, dst_vma, src_vma,
- dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl, src_folio);
+ err = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, src_folio, len);
+ /* folio is already unlocked by move_present_ptes() */
+ folio_put(src_folio);
+ src_folio = NULL;
} else {
struct folio *folio = NULL;
@@ -1732,7 +1807,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
{
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *src_vma, *dst_vma;
- unsigned long src_addr, dst_addr;
+ unsigned long src_addr, dst_addr, src_end;
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
@@ -1777,8 +1852,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
if (err)
goto out_unlock;
- for (src_addr = src_start, dst_addr = dst_start;
- src_addr < src_start + len;) {
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
spinlock_t *ptl;
pmd_t dst_pmdval;
unsigned long step_size;
@@ -1865,10 +1940,10 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- err = move_pages_pte(mm, dst_pmd, src_pmd,
- dst_vma, src_vma,
- dst_addr, src_addr, mode);
- step_size = PAGE_SIZE;
+ err = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ step_size = err;
}
cond_resched();
@@ -1880,7 +1955,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
break;
}
- if (err) {
+ if (err < 0) {
if (err == -EAGAIN)
continue;
break;
base-commit: 260f6f4fda93c8485c8037865c941b42b9cba5d2
--
2.50.1.552.g942d659e1b-goog
* Re: [PATCH] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE
From: Barry Song @ 2025-08-05 4:35 UTC
To: Lokesh Gidra
Cc: akpm, aarcange, linux-mm, linux-kernel, ngeoffray,
Suren Baghdasaryan, Kalesh Singh, Barry Song, David Hildenbrand,
Peter Xu
On Thu, Jul 31, 2025 at 6:47 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
>
> MOVE ioctl's runtime is dominated by TLB-flush cost, which is required
> for moving present pages. Mitigate this cost by opportunistically
> batching present contiguous pages for TLB flushing.
>
> Without batching, in our testing on an arm64 Android device with UFFD GC,
> which uses MOVE ioctl for compaction, we observed that out of the total
> time spent in move_pages_pte(), over 40% is in ptep_clear_flush(), and
> ~20% in vm_normal_folio().
>
> With batching, the proportion of vm_normal_folio() increases to over
> 70% of move_pages_pte() without any changes to vm_normal_folio().
> Furthermore, time spent within move_pages_pte() is only ~20%, which
> includes TLB-flush overhead.
>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Kalesh Singh <kaleshsingh@google.com>
> Cc: Barry Song <v-songbaohua@oppo.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Peter Xu <peterx@redhat.com>
> Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> ---
> mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++--------------
> 1 file changed, 127 insertions(+), 52 deletions(-)
>
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 8253978ee0fb..2465fb234671 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -1026,18 +1026,62 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
> pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
> }
>
> -static int move_present_pte(struct mm_struct *mm,
> - struct vm_area_struct *dst_vma,
> - struct vm_area_struct *src_vma,
> - unsigned long dst_addr, unsigned long src_addr,
> - pte_t *dst_pte, pte_t *src_pte,
> - pte_t orig_dst_pte, pte_t orig_src_pte,
> - pmd_t *dst_pmd, pmd_t dst_pmdval,
> - spinlock_t *dst_ptl, spinlock_t *src_ptl,
> - struct folio *src_folio)
> +/*
> + * Checks if the two ptes and the corresponding folio are eligible for batched
> + * move. If so, then returns pointer to the folio, after locking it. Otherwise,
> + * returns NULL.
> + */
> +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
> + unsigned long src_addr,
> + pte_t *src_pte, pte_t *dst_pte)
> +{
> + pte_t orig_dst_pte, orig_src_pte;
> + struct folio *folio;
> +
> + orig_dst_pte = ptep_get(dst_pte);
> + if (!pte_none(orig_dst_pte))
> + return NULL;
> +
> + orig_src_pte = ptep_get(src_pte);
> + if (pte_none(orig_src_pte))
> + return NULL;
> + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
> + return NULL;
> +
> + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> + if (!folio || !folio_trylock(folio))
> + return NULL;
> + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
> + folio_unlock(folio);
> + return NULL;
> + }
> + return folio;
> +}
> +
> +static long move_present_ptes(struct mm_struct *mm,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + pte_t *dst_pte, pte_t *src_pte,
> + pte_t orig_dst_pte, pte_t orig_src_pte,
> + pmd_t *dst_pmd, pmd_t dst_pmdval,
> + spinlock_t *dst_ptl, spinlock_t *src_ptl,
> + struct folio *src_folio, unsigned long len)
> {
> int err = 0;
> + unsigned long src_start = src_addr;
> + unsigned long addr_end;
> +
> + if (len > PAGE_SIZE) {
> + addr_end = (dst_addr + PMD_SIZE) & PMD_MASK;
> + if (dst_addr + len > addr_end)
> + len = addr_end - dst_addr;
>
> + addr_end = (src_addr + PMD_SIZE) & PMD_MASK;
> + if (src_addr + len > addr_end)
> + len = addr_end - src_addr;
> + }
> + flush_cache_range(src_vma, src_addr, src_addr + len);
> double_pt_lock(dst_ptl, src_ptl);
>
> if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
> @@ -1051,31 +1095,60 @@ static int move_present_pte(struct mm_struct *mm,
> err = -EBUSY;
> goto out;
> }
> + /* Avoid batching overhead for single page case */
> + if (len > PAGE_SIZE) {
> + flush_tlb_batched_pending(mm);
What's confusing to me is that the existing users of deferred (batched)
TLB flushing track the unmapping of multiple consecutive PTEs and defer
the invalidation until later. In contrast, you're not tracking anything
and instead call flush_tlb_range() directly, which triggers the flush
immediately.

It seems you might be combining two different batching approaches.

From what I can tell, you're essentially using flush_tlb_range() as a
replacement for flushing each entry individually.
> + arch_enter_lazy_mmu_mode();
> + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> + } else
> + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> +
> + addr_end = src_start + len;
> + do {
> + /* Folio got pinned from under us. Put it back and fail the move. */
> + if (folio_maybe_dma_pinned(src_folio)) {
> + set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> + err = -EBUSY;
> + break;
> + }
>
> - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> - /* Folio got pinned from under us. Put it back and fail the move. */
> - if (folio_maybe_dma_pinned(src_folio)) {
> - set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> - err = -EBUSY;
> - goto out;
> - }
> -
> - folio_move_anon_rmap(src_folio, dst_vma);
> - src_folio->index = linear_page_index(dst_vma, dst_addr);
> + folio_move_anon_rmap(src_folio, dst_vma);
> + src_folio->index = linear_page_index(dst_vma, dst_addr);
>
> - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> - /* Set soft dirty bit so userspace can notice the pte was moved */
> + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> + /* Set soft dirty bit so userspace can notice the pte was moved */
> #ifdef CONFIG_MEM_SOFT_DIRTY
> - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> #endif
> - if (pte_dirty(orig_src_pte))
> - orig_dst_pte = pte_mkdirty(orig_dst_pte);
> - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> + if (pte_dirty(orig_src_pte))
> + orig_dst_pte = pte_mkdirty(orig_dst_pte);
> + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> +
> + src_addr += PAGE_SIZE;
> + if (src_addr == addr_end)
> + break;
> + src_pte++;
> + dst_pte++;
>
> - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> + folio_unlock(src_folio);
> + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte);
> + if (!src_folio)
> + break;
> + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> + dst_addr += PAGE_SIZE;
> + } while (true);
> +
> + if (len > PAGE_SIZE) {
> + arch_leave_lazy_mmu_mode();
> + if (src_addr > src_start)
> + flush_tlb_range(src_vma, src_start, src_addr);
> + }
Can't we just remove the `if (len > PAGE_SIZE)` check and unify the
handling for both single-page and multi-page cases?
Thanks
Barry
* Re: [PATCH] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE
From: Lokesh Gidra @ 2025-08-05 6:30 UTC
To: Barry Song
Cc: akpm, aarcange, linux-mm, linux-kernel, ngeoffray,
Suren Baghdasaryan, Kalesh Singh, Barry Song, David Hildenbrand,
Peter Xu
On Mon, Aug 4, 2025 at 9:35 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Thu, Jul 31, 2025 at 6:47 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
> >
> > MOVE ioctl's runtime is dominated by TLB-flush cost, which is required
> > for moving present pages. Mitigate this cost by opportunistically
> > batching present contiguous pages for TLB flushing.
> >
> > Without batching, in our testing on an arm64 Android device with UFFD GC,
> > which uses MOVE ioctl for compaction, we observed that out of the total
> > time spent in move_pages_pte(), over 40% is in ptep_clear_flush(), and
> > ~20% in vm_normal_folio().
> >
> > With batching, the proportion of vm_normal_folio() increases to over
> > 70% of move_pages_pte() without any changes to vm_normal_folio().
> > Furthermore, time spent within move_pages_pte() is only ~20%, which
> > includes TLB-flush overhead.
> >
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Kalesh Singh <kaleshsingh@google.com>
> > Cc: Barry Song <v-songbaohua@oppo.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > ---
> > mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++--------------
> > 1 file changed, 127 insertions(+), 52 deletions(-)
> >
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 8253978ee0fb..2465fb234671 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -1026,18 +1026,62 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
> > pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
> > }
> >
> > -static int move_present_pte(struct mm_struct *mm,
> > - struct vm_area_struct *dst_vma,
> > - struct vm_area_struct *src_vma,
> > - unsigned long dst_addr, unsigned long src_addr,
> > - pte_t *dst_pte, pte_t *src_pte,
> > - pte_t orig_dst_pte, pte_t orig_src_pte,
> > - pmd_t *dst_pmd, pmd_t dst_pmdval,
> > - spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > - struct folio *src_folio)
> > +/*
> > + * Checks if the two ptes and the corresponding folio are eligible for batched
> > + * move. If so, then returns pointer to the folio, after locking it. Otherwise,
> > + * returns NULL.
> > + */
> > +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
> > + unsigned long src_addr,
> > + pte_t *src_pte, pte_t *dst_pte)
> > +{
> > + pte_t orig_dst_pte, orig_src_pte;
> > + struct folio *folio;
> > +
> > + orig_dst_pte = ptep_get(dst_pte);
> > + if (!pte_none(orig_dst_pte))
> > + return NULL;
> > +
> > + orig_src_pte = ptep_get(src_pte);
> > + if (pte_none(orig_src_pte))
> > + return NULL;
> > + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
> > + return NULL;
> > +
> > + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> > + if (!folio || !folio_trylock(folio))
> > + return NULL;
> > + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
> > + folio_unlock(folio);
> > + return NULL;
> > + }
> > + return folio;
> > +}
> > +
> > +static long move_present_ptes(struct mm_struct *mm,
> > + struct vm_area_struct *dst_vma,
> > + struct vm_area_struct *src_vma,
> > + unsigned long dst_addr, unsigned long src_addr,
> > + pte_t *dst_pte, pte_t *src_pte,
> > + pte_t orig_dst_pte, pte_t orig_src_pte,
> > + pmd_t *dst_pmd, pmd_t dst_pmdval,
> > + spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > + struct folio *src_folio, unsigned long len)
> > {
> > int err = 0;
> > + unsigned long src_start = src_addr;
> > + unsigned long addr_end;
> > +
> > + if (len > PAGE_SIZE) {
> > + addr_end = (dst_addr + PMD_SIZE) & PMD_MASK;
> > + if (dst_addr + len > addr_end)
> > + len = addr_end - dst_addr;
> >
> > + addr_end = (src_addr + PMD_SIZE) & PMD_MASK;
> > + if (src_addr + len > addr_end)
> > + len = addr_end - src_addr;
> > + }
> > + flush_cache_range(src_vma, src_addr, src_addr + len);
> > double_pt_lock(dst_ptl, src_ptl);
> >
> > if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
> > @@ -1051,31 +1095,60 @@ static int move_present_pte(struct mm_struct *mm,
> > err = -EBUSY;
> > goto out;
> > }
> > + /* Avoid batching overhead for single page case */
> > + if (len > PAGE_SIZE) {
> > + flush_tlb_batched_pending(mm);
>
> What’s confusing to me is that they track the unmapping of multiple
> consecutive PTEs and defer TLB invalidation until later.
> In contrast, you’re not tracking anything and instead call
> flush_tlb_range() directly, which triggers the flush immediately.
>
> It seems you might be combining two different batching approaches.
The changes I made are in line with how mremap() does batching; see
move_ptes() in mm/mremap.c.

From the comment in flush_tlb_batched_pending() [1] it seems necessary
in this case too. Please correct me if I'm wrong. I'll be happy to
remove it if it's not required.

[1] https://elixir.bootlin.com/linux/v6.16/source/mm/rmap.c#L728
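
For reference, move_ptes() calls flush_tlb_batched_pending() right after
taking the page table locks and before clearing any PTE, roughly like
this (a paraphrased sketch from memory, not verbatim kernel code):

        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
        flush_tlb_batched_pending(vma->vm_mm);
        arch_enter_lazy_mmu_mode();
        /* per-PTE ptep_get_and_clear()/set_pte_at() loop, no per-PTE flush */
        arch_leave_lazy_mmu_mode();
        /* the TLB is invalidated later, once, for the whole moved range */
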
> From what I can tell, you're essentially using flush_range
> as a replacement for flushing each entry individually.
That's correct. The idea is to reduce the number of IPIs required for
flushing the TLB entries. Since it is quite common for the ioctl to be
invoked with several pages in one go, batching the flush helps a great deal.
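
Roughly, the difference looks like this (an illustrative sketch only, not
the patch's exact code; 'nr' and 'src_start' are sketch-local names):

        /* per-page flushing: one TLB invalidation (and possibly IPIs) per PTE */
        for (i = 0; i < nr; i++, src_pte++, src_addr += PAGE_SIZE)
                orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);

        /* batched: clear all the PTEs first, then issue one ranged invalidation */
        arch_enter_lazy_mmu_mode();
        for (i = 0; i < nr; i++, src_pte++, src_addr += PAGE_SIZE)
                orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
        arch_leave_lazy_mmu_mode();
        flush_tlb_range(src_vma, src_start, src_start + nr * PAGE_SIZE);
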
>
> > + arch_enter_lazy_mmu_mode();
> > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > + } else
> > + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > +
> > + addr_end = src_start + len;
> > + do {
> > + /* Folio got pinned from under us. Put it back and fail the move. */
> > + if (folio_maybe_dma_pinned(src_folio)) {
> > + set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > + err = -EBUSY;
> > + break;
> > + }
> >
> > - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > - /* Folio got pinned from under us. Put it back and fail the move. */
> > - if (folio_maybe_dma_pinned(src_folio)) {
> > - set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > - err = -EBUSY;
> > - goto out;
> > - }
> > -
> > - folio_move_anon_rmap(src_folio, dst_vma);
> > - src_folio->index = linear_page_index(dst_vma, dst_addr);
> > + folio_move_anon_rmap(src_folio, dst_vma);
> > + src_folio->index = linear_page_index(dst_vma, dst_addr);
> >
> > - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > - /* Set soft dirty bit so userspace can notice the pte was moved */
> > + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > + /* Set soft dirty bit so userspace can notice the pte was moved */
> > #ifdef CONFIG_MEM_SOFT_DIRTY
> > - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > #endif
> > - if (pte_dirty(orig_src_pte))
> > - orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > + if (pte_dirty(orig_src_pte))
> > + orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > +
> > + src_addr += PAGE_SIZE;
> > + if (src_addr == addr_end)
> > + break;
> > + src_pte++;
> > + dst_pte++;
> >
> > - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > + folio_unlock(src_folio);
> > + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte);
> > + if (!src_folio)
> > + break;
> > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > + dst_addr += PAGE_SIZE;
> > + } while (true);
> > +
> > + if (len > PAGE_SIZE) {
> > + arch_leave_lazy_mmu_mode();
> > + if (src_addr > src_start)
> > + flush_tlb_range(src_vma, src_start, src_addr);
> > + }
>
> Can't we just remove the `if (len > PAGE_SIZE)` check and unify the
> handling for both single-page and multi-page cases?
We certainly can. Initially it seemed to me that lazy/batched
invalidation has its own overhead, and I wanted to avoid that in the
single-page case because the ioctl does get called for single pages
quite a bit, often in time-sensitive code paths. However, on a deeper
relook, I see it's not really that different.

I'll unify in the next patch. Thanks for the suggestion.
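
Roughly, the unified path would always go through the lazy-MMU plus
ranged-flush sequence, for example (an untested sketch, reusing the
names from this patch):

        double_pt_lock(dst_ptl, src_ptl);
        /* stability check as before */
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
        /* existing per-PTE move loop (pin check, rmap move, set_pte_at()) */
        arch_leave_lazy_mmu_mode();
        if (src_addr > src_start)
                flush_tlb_range(src_vma, src_start, src_addr);
        double_pt_unlock(dst_ptl, src_ptl);
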
>
>
> Thanks
> Barry
* Re: [PATCH] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE
From: Barry Song @ 2025-08-05 10:21 UTC
To: Lokesh Gidra
Cc: akpm, aarcange, linux-mm, linux-kernel, ngeoffray,
Suren Baghdasaryan, Kalesh Singh, Barry Song, David Hildenbrand,
Peter Xu
On Tue, Aug 5, 2025 at 2:30 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
>
> On Mon, Aug 4, 2025 at 9:35 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Thu, Jul 31, 2025 at 6:47 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
> > >
> > > MOVE ioctl's runtime is dominated by TLB-flush cost, which is required
> > > for moving present pages. Mitigate this cost by opportunistically
> > > batching present contiguous pages for TLB flushing.
> > >
> > > Without batching, in our testing on an arm64 Android device with UFFD GC,
> > > which uses MOVE ioctl for compaction, we observed that out of the total
> > > time spent in move_pages_pte(), over 40% is in ptep_clear_flush(), and
> > > ~20% in vm_normal_folio().
> > >
> > > With batching, the proportion of vm_normal_folio() increases to over
> > > 70% of move_pages_pte() without any changes to vm_normal_folio().
> > > Furthermore, time spent within move_pages_pte() is only ~20%, which
> > > includes TLB-flush overhead.
> > >
> > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > Cc: Kalesh Singh <kaleshsingh@google.com>
> > > Cc: Barry Song <v-songbaohua@oppo.com>
> > > Cc: David Hildenbrand <david@redhat.com>
> > > Cc: Peter Xu <peterx@redhat.com>
> > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > ---
> > > mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++--------------
> > > 1 file changed, 127 insertions(+), 52 deletions(-)
> > >
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 8253978ee0fb..2465fb234671 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > > @@ -1026,18 +1026,62 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
> > > pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
> > > }
> > >
> > > -static int move_present_pte(struct mm_struct *mm,
> > > - struct vm_area_struct *dst_vma,
> > > - struct vm_area_struct *src_vma,
> > > - unsigned long dst_addr, unsigned long src_addr,
> > > - pte_t *dst_pte, pte_t *src_pte,
> > > - pte_t orig_dst_pte, pte_t orig_src_pte,
> > > - pmd_t *dst_pmd, pmd_t dst_pmdval,
> > > - spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > > - struct folio *src_folio)
> > > +/*
> > > + * Checks if the two ptes and the corresponding folio are eligible for batched
> > > + * move. If so, then returns pointer to the folio, after locking it. Otherwise,
> > > + * returns NULL.
> > > + */
> > > +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
> > > + unsigned long src_addr,
> > > + pte_t *src_pte, pte_t *dst_pte)
> > > +{
> > > + pte_t orig_dst_pte, orig_src_pte;
> > > + struct folio *folio;
> > > +
> > > + orig_dst_pte = ptep_get(dst_pte);
> > > + if (!pte_none(orig_dst_pte))
> > > + return NULL;
> > > +
> > > + orig_src_pte = ptep_get(src_pte);
> > > + if (pte_none(orig_src_pte))
> > > + return NULL;
> > > + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
> > > + return NULL;
> > > +
> > > + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> > > + if (!folio || !folio_trylock(folio))
> > > + return NULL;
> > > + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
> > > + folio_unlock(folio);
> > > + return NULL;
> > > + }
> > > + return folio;
> > > +}
> > > +
> > > +static long move_present_ptes(struct mm_struct *mm,
> > > + struct vm_area_struct *dst_vma,
> > > + struct vm_area_struct *src_vma,
> > > + unsigned long dst_addr, unsigned long src_addr,
> > > + pte_t *dst_pte, pte_t *src_pte,
> > > + pte_t orig_dst_pte, pte_t orig_src_pte,
> > > + pmd_t *dst_pmd, pmd_t dst_pmdval,
> > > + spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > > + struct folio *src_folio, unsigned long len)
> > > {
> > > int err = 0;
> > > + unsigned long src_start = src_addr;
> > > + unsigned long addr_end;
> > > +
> > > + if (len > PAGE_SIZE) {
> > > + addr_end = (dst_addr + PMD_SIZE) & PMD_MASK;
> > > + if (dst_addr + len > addr_end)
> > > + len = addr_end - dst_addr;
> > >
> > > + addr_end = (src_addr + PMD_SIZE) & PMD_MASK;
> > > + if (src_addr + len > addr_end)
> > > + len = addr_end - src_addr;
> > > + }
> > > + flush_cache_range(src_vma, src_addr, src_addr + len);
> > > double_pt_lock(dst_ptl, src_ptl);
> > >
> > > if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
> > > @@ -1051,31 +1095,60 @@ static int move_present_pte(struct mm_struct *mm,
> > > err = -EBUSY;
> > > goto out;
> > > }
> > > + /* Avoid batching overhead for single page case */
> > > + if (len > PAGE_SIZE) {
> > > + flush_tlb_batched_pending(mm);
> >
> > What’s confusing to me is that they track the unmapping of multiple
> > consecutive PTEs and defer TLB invalidation until later.
> > In contrast, you’re not tracking anything and instead call
> > flush_tlb_range() directly, which triggers the flush immediately.
> >
> > It seems you might be combining two different batching approaches.
>
> These changes I made are in line with how mremap() does batching. See
> move_ptes() in mm/mremap.c
>
> From the comment in flush_tlb_batched_pending() [1] it seems necessary
> in this case too. Please correct me if I'm wrong. I'll be happy to
> remove it if it's not required.
>
> [1] https://elixir.bootlin.com/linux/v6.16/source/mm/rmap.c#L728
Whether we need flush_tlb_batched_pending() has nothing to do with your
patch. It's entirely about synchronizing with other pending TLBIs, such as
those from try_to_unmap_one() and try_to_migrate_one().

In short, if it's needed, it's needed regardless of whether your patch is
applied or whether you're dealing with len > PAGE_SIZE.
>
> > From what I can tell, you're essentially using flush_range
> > as a replacement for flushing each entry individually.
>
> That's correct. The idea is to reduce the number of IPIs required for
> flushing the TLB entries. Since it is quite common that the ioctl is
> invoked with several pages in one go, this greatly benefits.
>
> >
> > > + arch_enter_lazy_mmu_mode();
> > > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > > + } else
> > > + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > > +
> > > + addr_end = src_start + len;
> > > + do {
> > > + /* Folio got pinned from under us. Put it back and fail the move. */
> > > + if (folio_maybe_dma_pinned(src_folio)) {
> > > + set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > > + err = -EBUSY;
> > > + break;
> > > + }
> > >
> > > - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > > - /* Folio got pinned from under us. Put it back and fail the move. */
> > > - if (folio_maybe_dma_pinned(src_folio)) {
> > > - set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > > - err = -EBUSY;
> > > - goto out;
> > > - }
> > > -
> > > - folio_move_anon_rmap(src_folio, dst_vma);
> > > - src_folio->index = linear_page_index(dst_vma, dst_addr);
> > > + folio_move_anon_rmap(src_folio, dst_vma);
> > > + src_folio->index = linear_page_index(dst_vma, dst_addr);
> > >
> > > - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > > - /* Set soft dirty bit so userspace can notice the pte was moved */
> > > + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > > + /* Set soft dirty bit so userspace can notice the pte was moved */
> > > #ifdef CONFIG_MEM_SOFT_DIRTY
> > > - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > > + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > > #endif
> > > - if (pte_dirty(orig_src_pte))
> > > - orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > > - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > > + if (pte_dirty(orig_src_pte))
> > > + orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > > + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > > + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > > +
> > > + src_addr += PAGE_SIZE;
> > > + if (src_addr == addr_end)
> > > + break;
> > > + src_pte++;
> > > + dst_pte++;
> > >
> > > - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > > + folio_unlock(src_folio);
> > > + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte);
> > > + if (!src_folio)
> > > + break;
> > > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > > + dst_addr += PAGE_SIZE;
> > > + } while (true);
> > > +
> > > + if (len > PAGE_SIZE) {
> > > + arch_leave_lazy_mmu_mode();
> > > + if (src_addr > src_start)
> > > + flush_tlb_range(src_vma, src_start, src_addr);
> > > + }
> >
> > Can't we just remove the `if (len > PAGE_SIZE)` check and unify the
> > handling for both single-page and multi-page cases?
>
> We certainly can. Initially it seemed to me that lazy/batched
> invalidation has its own overhead and I wanted to avoid that in the
> single-page case because the ioctl does get called for single pages
> quite a bit. That too in time sensitive code paths. However, on a
> deeper relook now, I noticed it's not really that different.
>
> I'll unify in the next patch. Thanks for the suggestion.
Yes, that would be nice — especially since flush_tlb_batched_pending()
is not needed in this patch.

Whether it's needed for uffd_move is a separate matter and should be
addressed in a separate patch, if necessary — for example, if there's a
similar race as described in Commit 3ea277194daa
("mm, mprotect: flush TLB if potentially racing with a parallel reclaim
leaving stale TLB entries").

CPU0                                CPU1
----                                ----
                                    user accesses memory using RW PTE
                                    [PTE now cached in TLB]
try_to_unmap_one()
==> ptep_get_and_clear()
==> set_tlb_ubc_flush_pending()
                                    mprotect(addr, PROT_READ)
                                    ==> change_pte_range()
                                    ==> [ PTE non-present - no flush ]

                                    user writes using cached RW PTE
...

try_to_unmap_flush()

Thanks
Barry
* Re: [PATCH] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE
From: Lokesh Gidra @ 2025-08-05 10:36 UTC
To: Barry Song
Cc: akpm, aarcange, linux-mm, linux-kernel, ngeoffray,
Suren Baghdasaryan, Kalesh Singh, Barry Song, David Hildenbrand,
Peter Xu
On Tue, Aug 5, 2025 at 3:21 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Tue, Aug 5, 2025 at 2:30 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
> >
> > On Mon, Aug 4, 2025 at 9:35 PM Barry Song <21cnbao@gmail.com> wrote:
> > >
> > > On Thu, Jul 31, 2025 at 6:47 PM Lokesh Gidra <lokeshgidra@google.com> wrote:
> > > >
> > > > MOVE ioctl's runtime is dominated by TLB-flush cost, which is required
> > > > for moving present pages. Mitigate this cost by opportunistically
> > > > batching present contiguous pages for TLB flushing.
> > > >
> > > > Without batching, in our testing on an arm64 Android device with UFFD GC,
> > > > which uses MOVE ioctl for compaction, we observed that out of the total
> > > > time spent in move_pages_pte(), over 40% is in ptep_clear_flush(), and
> > > > ~20% in vm_normal_folio().
> > > >
> > > > With batching, the proportion of vm_normal_folio() increases to over
> > > > 70% of move_pages_pte() without any changes to vm_normal_folio().
> > > > Furthermore, time spent within move_pages_pte() is only ~20%, which
> > > > includes TLB-flush overhead.
> > > >
> > > > Cc: Suren Baghdasaryan <surenb@google.com>
> > > > Cc: Kalesh Singh <kaleshsingh@google.com>
> > > > Cc: Barry Song <v-songbaohua@oppo.com>
> > > > Cc: David Hildenbrand <david@redhat.com>
> > > > Cc: Peter Xu <peterx@redhat.com>
> > > > Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
> > > > ---
> > > > mm/userfaultfd.c | 179 +++++++++++++++++++++++++++++++++--------------
> > > > 1 file changed, 127 insertions(+), 52 deletions(-)
> > > >
> > > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > > index 8253978ee0fb..2465fb234671 100644
> > > > --- a/mm/userfaultfd.c
> > > > +++ b/mm/userfaultfd.c
> > > > @@ -1026,18 +1026,62 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
> > > > pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
> > > > }
> > > >
> > > > -static int move_present_pte(struct mm_struct *mm,
> > > > - struct vm_area_struct *dst_vma,
> > > > - struct vm_area_struct *src_vma,
> > > > - unsigned long dst_addr, unsigned long src_addr,
> > > > - pte_t *dst_pte, pte_t *src_pte,
> > > > - pte_t orig_dst_pte, pte_t orig_src_pte,
> > > > - pmd_t *dst_pmd, pmd_t dst_pmdval,
> > > > - spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > > > - struct folio *src_folio)
> > > > +/*
> > > > + * Checks if the two ptes and the corresponding folio are eligible for batched
> > > > + * move. If so, then returns pointer to the folio, after locking it. Otherwise,
> > > > + * returns NULL.
> > > > + */
> > > > +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
> > > > + unsigned long src_addr,
> > > > + pte_t *src_pte, pte_t *dst_pte)
> > > > +{
> > > > + pte_t orig_dst_pte, orig_src_pte;
> > > > + struct folio *folio;
> > > > +
> > > > + orig_dst_pte = ptep_get(dst_pte);
> > > > + if (!pte_none(orig_dst_pte))
> > > > + return NULL;
> > > > +
> > > > + orig_src_pte = ptep_get(src_pte);
> > > > + if (pte_none(orig_src_pte))
> > > > + return NULL;
> > > > + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
> > > > + return NULL;
> > > > +
> > > > + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> > > > + if (!folio || !folio_trylock(folio))
> > > > + return NULL;
> > > > + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
> > > > + folio_unlock(folio);
> > > > + return NULL;
> > > > + }
> > > > + return folio;
> > > > +}
> > > > +
> > > > +static long move_present_ptes(struct mm_struct *mm,
> > > > + struct vm_area_struct *dst_vma,
> > > > + struct vm_area_struct *src_vma,
> > > > + unsigned long dst_addr, unsigned long src_addr,
> > > > + pte_t *dst_pte, pte_t *src_pte,
> > > > + pte_t orig_dst_pte, pte_t orig_src_pte,
> > > > + pmd_t *dst_pmd, pmd_t dst_pmdval,
> > > > + spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > > > + struct folio *src_folio, unsigned long len)
> > > > {
> > > > int err = 0;
> > > > + unsigned long src_start = src_addr;
> > > > + unsigned long addr_end;
> > > > +
> > > > + if (len > PAGE_SIZE) {
> > > > + addr_end = (dst_addr + PMD_SIZE) & PMD_MASK;
> > > > + if (dst_addr + len > addr_end)
> > > > + len = addr_end - dst_addr;
> > > >
> > > > + addr_end = (src_addr + PMD_SIZE) & PMD_MASK;
> > > > + if (src_addr + len > addr_end)
> > > > + len = addr_end - src_addr;
> > > > + }
> > > > + flush_cache_range(src_vma, src_addr, src_addr + len);
> > > > double_pt_lock(dst_ptl, src_ptl);
> > > >
> > > > if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
> > > > @@ -1051,31 +1095,60 @@ static int move_present_pte(struct mm_struct *mm,
> > > > err = -EBUSY;
> > > > goto out;
> > > > }
> > > > + /* Avoid batching overhead for single page case */
> > > > + if (len > PAGE_SIZE) {
> > > > + flush_tlb_batched_pending(mm);
> > >
> > > What’s confusing to me is that they track the unmapping of multiple
> > > consecutive PTEs and defer TLB invalidation until later.
> > > In contrast, you’re not tracking anything and instead call
> > > flush_tlb_range() directly, which triggers the flush immediately.
> > >
> > > It seems you might be combining two different batching approaches.
> >
> > These changes I made are in line with how mremap() does batching. See
> > move_ptes() in mm/mremap.c
> >
> > From the comment in flush_tlb_batched_pending() [1] it seems necessary
> > in this case too. Please correct me if I'm wrong. I'll be happy to
> > remove it if it's not required.
> >
> > [1] https://elixir.bootlin.com/linux/v6.16/source/mm/rmap.c#L728
>
> Whether we need flush_tlb_batched_pending() has nothing to do with your
> patch. It's entirely about synchronizing with other pending TLBIs, such as
> those from try_to_unmap_one() and try_to_migrate_one().
>
> In short, if it's needed, it's needed regardless of whether your patch is
> applied or whether you're dealing with len > PAGE_SIZE.
>
> >
> > > From what I can tell, you're essentially using flush_range
> > > as a replacement for flushing each entry individually.
> >
> > That's correct. The idea is to reduce the number of IPIs required for
> > flushing the TLB entries. Since it is quite common that the ioctl is
> > invoked with several pages in one go, this greatly benefits.
> >
> > >
> > > > + arch_enter_lazy_mmu_mode();
> > > > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > > > + } else
> > > > + orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > > > +
> > > > + addr_end = src_start + len;
> > > > + do {
> > > > + /* Folio got pinned from under us. Put it back and fail the move. */
> > > > + if (folio_maybe_dma_pinned(src_folio)) {
> > > > + set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > > > + err = -EBUSY;
> > > > + break;
> > > > + }
> > > >
> > > > - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > > > - /* Folio got pinned from under us. Put it back and fail the move. */
> > > > - if (folio_maybe_dma_pinned(src_folio)) {
> > > > - set_pte_at(mm, src_addr, src_pte, orig_src_pte);
> > > > - err = -EBUSY;
> > > > - goto out;
> > > > - }
> > > > -
> > > > - folio_move_anon_rmap(src_folio, dst_vma);
> > > > - src_folio->index = linear_page_index(dst_vma, dst_addr);
> > > > + folio_move_anon_rmap(src_folio, dst_vma);
> > > > + src_folio->index = linear_page_index(dst_vma, dst_addr);
> > > >
> > > > - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > > > - /* Set soft dirty bit so userspace can notice the pte was moved */
> > > > + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
> > > > + /* Set soft dirty bit so userspace can notice the pte was moved */
> > > > #ifdef CONFIG_MEM_SOFT_DIRTY
> > > > - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > > > + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
> > > > #endif
> > > > - if (pte_dirty(orig_src_pte))
> > > > - orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > > > - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > > > + if (pte_dirty(orig_src_pte))
> > > > + orig_dst_pte = pte_mkdirty(orig_dst_pte);
> > > > + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
> > > > + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > > > +
> > > > + src_addr += PAGE_SIZE;
> > > > + if (src_addr == addr_end)
> > > > + break;
> > > > + src_pte++;
> > > > + dst_pte++;
> > > >
> > > > - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
> > > > + folio_unlock(src_folio);
> > > > + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, dst_pte);
> > > > + if (!src_folio)
> > > > + break;
> > > > + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
> > > > + dst_addr += PAGE_SIZE;
> > > > + } while (true);
> > > > +
> > > > + if (len > PAGE_SIZE) {
> > > > + arch_leave_lazy_mmu_mode();
> > > > + if (src_addr > src_start)
> > > > + flush_tlb_range(src_vma, src_start, src_addr);
> > > > + }
> > >
> > > Can't we just remove the `if (len > PAGE_SIZE)` check and unify the
> > > handling for both single-page and multi-page cases?
> >
> > We certainly can. Initially it seemed to me that lazy/batched
> > invalidation has its own overhead and I wanted to avoid that in the
> > single-page case because the ioctl does get called for single pages
> > quite a bit. That too in time sensitive code paths. However, on a
> > deeper relook now, I noticed it's not really that different.
> >
> > I'll unify in the next patch. Thanks for the suggestion.
>
> Yes, that would be nice — especially since flush_tlb_batched_pending()
> is not needed in this patch.
>
> Whether it's needed for uffd_move is a separate matter and should be
> addressed in a separate patch, if necessary — for example, if there's a
> similar race as described in Commit 3ea277194daa
> ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim
> leaving stale TLB entries").
>
> CPU0                                CPU1
> ----                                ----
>                                     user accesses memory using RW PTE
>                                     [PTE now cached in TLB]
> try_to_unmap_one()
> ==> ptep_get_and_clear()
> ==> set_tlb_ubc_flush_pending()
>                                     mprotect(addr, PROT_READ)
>                                     ==> change_pte_range()
>                                     ==> [ PTE non-present - no flush ]
>
>                                     user writes using cached RW PTE
> ...
>
> try_to_unmap_flush()

Thanks so much for clarifying. I understand the issue now. I'll think
about whether we need it in userfaultfd_move or not, but as you said,
that is for a separate patch, if needed at all.

I'll upload v2 in some time.

> Thanks
> Barry