+ mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch added to mm-new branch

All of lore.kernel.org
 help / color / mirror / Atom feed

* + mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch added to mm-new branch
@ 2025-06-17 23:03 Andrew Morton
  0 siblings, 0 replies; 3+ messages in thread
From: Andrew Morton @ 2025-06-17 23:03 UTC (permalink / raw)
  To: mm-commits, willy, shikemeng, nphamcs, hughd, chrisl, bhe,
	baolin.wang, baohua, kasong, akpm


The patch titled
     Subject: mm/shmem, swap: avoid false positive swap cache lookup
has been added to the -mm mm-new branch.  Its filename is
     mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will later appear in the mm-new branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Note, mm-new is a provisional staging ground for work-in-progress
patches, and acceptance into mm-new is a notification for others to take
notice and to finish up reviews.  Please do not hesitate to respond to
review feedback and post updated versions to replace or incrementally
fixup patches in mm-new.

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Kairui Song <kasong@tencent.com>
Subject: mm/shmem, swap: avoid false positive swap cache lookup
Date: Wed, 18 Jun 2025 02:35:03 +0800

If the shmem read request's index points to the middle of a large swap
entry, shmem swapin does the swap cache lookup using the large swap entry's
starting value (the first sub swap entry of this large entry).  This will
lead to a false positive lookup result if only the first few swap entries
are cached, but the requested swap entry pointed to by index is uncached.

Currently shmem will do a large entry split then retry the swapin from
the beginning, which is a waste of CPU and fragile.  Handle this correctly.

Also add some sanity checks to help understand the code and ensure things
won't go wrong.

Link: https://lkml.kernel.org/r/20250617183503.10527-5-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/shmem.c |   61 ++++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

--- a/mm/shmem.c~mm-shmem-swap-avoid-false-positive-swap-cache-lookup
+++ a/mm/shmem.c
@@ -1977,12 +1977,12 @@ unlock:
 
 static struct folio *shmem_swapin_direct(struct inode *inode,
 		struct vm_area_struct *vma, pgoff_t index,
-		swp_entry_t entry, int *order, gfp_t gfp)
+		swp_entry_t swap_entry, swp_entry_t swap,
+		int *order, gfp_t gfp)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int nr_pages = 1 << *order;
 	struct folio *new;
-	pgoff_t offset;
 	void *shadow;
 
 	/*
@@ -2003,13 +2003,11 @@ static struct folio *shmem_swapin_direct
 		 */
 		if ((vma && userfaultfd_armed(vma)) ||
 		    !zswap_never_enabled() ||
-		    non_swapcache_batch(entry, nr_pages) != nr_pages) {
-			offset = index - round_down(index, nr_pages);
-			entry = swp_entry(swp_type(entry),
-					  swp_offset(entry) + offset);
+		    non_swapcache_batch(swap_entry, nr_pages) != nr_pages) {
 			*order = 0;
 			nr_pages = 1;
 		} else {
+			swap.val = swap_entry.val;
 			gfp_t huge_gfp = vma_thp_gfp_mask(vma);
 
 			gfp = limit_gfp_mask(huge_gfp, gfp);
@@ -2021,7 +2019,7 @@ static struct folio *shmem_swapin_direct
 		return ERR_PTR(-ENOMEM);
 
 	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
-					   gfp, entry)) {
+					   gfp, swap)) {
 		folio_put(new);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -2036,17 +2034,17 @@ static struct folio *shmem_swapin_direct
 	 * In this case, shmem_add_to_page_cache() will help identify the
 	 * concurrent swapin and return -EEXIST.
 	 */
-	if (swapcache_prepare(entry, nr_pages)) {
+	if (swapcache_prepare(swap, nr_pages)) {
 		folio_put(new);
 		return ERR_PTR(-EEXIST);
 	}
 
 	__folio_set_locked(new);
 	__folio_set_swapbacked(new);
-	new->swap = entry;
+	new->swap = swap;
 
-	memcg1_swapin(entry, nr_pages);
-	shadow = get_shadow_from_swap_cache(entry);
+	memcg1_swapin(swap, nr_pages);
+	shadow = get_shadow_from_swap_cache(swap);
 	if (shadow)
 		workingset_refault(new, shadow);
 	folio_add_lru(new);
@@ -2278,20 +2276,21 @@ static int shmem_swapin_folio(struct ino
 	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error, nr_pages, order, swap_order;
+	swp_entry_t swap, swap_entry;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
 	bool skip_swapcache = false;
-	swp_entry_t swap;
+	pgoff_t offset;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
-	swap = radix_to_swp_entry(*foliop);
+	swap_entry = radix_to_swp_entry(*foliop);
 	*foliop = NULL;
 
-	if (is_poisoned_swp_entry(swap))
+	if (is_poisoned_swp_entry(swap_entry))
 		return -EIO;
 
-	si = get_swap_device(swap);
-	order = shmem_swap_check_entry(mapping, index, swap);
+	si = get_swap_device(swap_entry);
+	order = shmem_swap_check_entry(mapping, index, swap_entry);
 	if (unlikely(!si)) {
 		if (order < 0)
 			return -EEXIST;
@@ -2303,7 +2302,9 @@ static int shmem_swapin_folio(struct ino
 		return -EEXIST;
 	}
 
-	/* Look it up and read it in.. */
+	/* @index may point to the middle of a large entry, get the real swap value first */
+	offset = index - round_down(index, 1 << order);
+	swap.val = swap_entry.val + offset;
 	folio = swap_cache_get_folio(swap, NULL, 0);
 	if (!folio) {
 		/* Or update major stats only when swapin succeeds?? */
@@ -2315,7 +2316,7 @@ static int shmem_swapin_folio(struct ino
 		/* Try direct mTHP swapin bypassing swap cache and readahead */
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			swap_order = order;
-			folio = shmem_swapin_direct(inode, vma, index,
+			folio = shmem_swapin_direct(inode, vma, index, swap_entry,
 						    swap, &swap_order, gfp);
 			if (!IS_ERR(folio)) {
 				skip_swapcache = true;
@@ -2338,28 +2339,25 @@ static int shmem_swapin_folio(struct ino
 		}
 	}
 alloced:
+	swap_order = folio_order(folio);
+	nr_pages = folio_nr_pages(folio);
+
+	/* The swap-in should cover both @swap and @index */
+	swap.val = round_down(swap.val, nr_pages);
+	VM_WARN_ON_ONCE(swap.val > swap_entry.val + offset);
+	VM_WARN_ON_ONCE(swap.val + nr_pages <= swap_entry.val + offset);
+
 	/*
 	 * We need to split an existing large entry if swapin brought in a
 	 * smaller folio due to various of reasons.
-	 *
-	 * And worth noting there is a special case: if there is a smaller
-	 * cached folio that covers @swap, but not @index (it only covers
-	 * first few sub entries of the large entry, but @index points to
-	 * later parts), the swap cache lookup will still see this folio,
-	 * And we need to split the large entry here. Later checks will fail,
-	 * as it can't satisfy the swap requirement, and we will retry
-	 * the swapin from beginning.
 	 */
-	swap_order = folio_order(folio);
+	index = round_down(index, nr_pages);
 	if (order > swap_order) {
-		error = shmem_split_swap_entry(inode, index, swap, gfp);
+		error = shmem_split_swap_entry(inode, index, swap_entry, gfp);
 		if (error)
 			goto failed_nolock;
 	}
 
-	index = round_down(index, 1 << swap_order);
-	swap.val = round_down(swap.val, 1 << swap_order);
-
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
 	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
@@ -2372,7 +2370,6 @@ alloced:
 		goto failed;
 	}
 	folio_wait_writeback(folio);
-	nr_pages = folio_nr_pages(folio);
 
 	/*
 	 * Some architectures may have to restore extra metadata to the
_

Patches currently in -mm which might be from kasong@tencent.com are

mm-shmem-swap-fix-softlockup-with-mthp-swapin.patch
mm-shmem-swap-fix-softlockup-with-mthp-swapin-v3.patch
mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap-cache.patch
mm-list_lru-refactor-the-locking-code.patch
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
mm-shmem-swap-avoid-redundant-xarray-lookup-during-swapin.patch
mm-shmem-swap-improve-mthp-swapin-process.patch
mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch


^ permalink raw reply	[flat|nested] 3+ messages in thread

* + mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch added to mm-new branch
@ 2025-06-20  0:40 Andrew Morton
  0 siblings, 0 replies; 3+ messages in thread
From: Andrew Morton @ 2025-06-20  0:40 UTC (permalink / raw)
  To: mm-commits, willy, shikemeng, nphamcs, hughd, chrisl, bhe,
	baolin.wang, baohua, kasong, akpm


The patch titled
     Subject: mm/shmem, swap: avoid false positive swap cache lookup
has been added to the -mm mm-new branch.  Its filename is
     mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will later appear in the mm-new branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Note, mm-new is a provisional staging ground for work-in-progress
patches, and acceptance into mm-new is a notification for others to take
notice and to finish up reviews.  Please do not hesitate to respond to
review feedback and post updated versions to replace or incrementally
fixup patches in mm-new.

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Kairui Song <kasong@tencent.com>
Subject: mm/shmem, swap: avoid false positive swap cache lookup
Date: Fri, 20 Jun 2025 01:55:38 +0800

If the shmem read request's index points to the middle of a large swap
entry, shmem swapin does the swap cache lookup using the large swap entry's
starting value (the first sub swap entry of this large entry).  This will
lead to a false positive lookup result if only the first few swap entries
are cached, but the requested swap entry pointed to by index is uncached.

Currently shmem will do a large entry split then retry the swapin from
the beginning, which is a waste of CPU and fragile.  Handle this correctly.

Also add some sanity checks to help understand the code and ensure things
won't go wrong.

Link: https://lkml.kernel.org/r/20250619175538.15799-5-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/shmem.c |   61 ++++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

--- a/mm/shmem.c~mm-shmem-swap-avoid-false-positive-swap-cache-lookup
+++ a/mm/shmem.c
@@ -1977,12 +1977,12 @@ unlock:
 
 static struct folio *shmem_swapin_direct(struct inode *inode,
 		struct vm_area_struct *vma, pgoff_t index,
-		swp_entry_t entry, int *order, gfp_t gfp)
+		swp_entry_t index_entry, swp_entry_t swap,
+		int *order, gfp_t gfp)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int nr_pages = 1 << *order;
 	struct folio *new;
-	pgoff_t offset;
 	void *shadow;
 
 	/*
@@ -2003,13 +2003,11 @@ static struct folio *shmem_swapin_direct
 		 */
 		if ((vma && userfaultfd_armed(vma)) ||
 		    !zswap_never_enabled() ||
-		    non_swapcache_batch(entry, nr_pages) != nr_pages) {
-			offset = index - round_down(index, nr_pages);
-			entry = swp_entry(swp_type(entry),
-					  swp_offset(entry) + offset);
+		    non_swapcache_batch(index_entry, nr_pages) != nr_pages) {
 			*order = 0;
 			nr_pages = 1;
 		} else {
+			swap.val = index_entry.val;
 			gfp_t huge_gfp = vma_thp_gfp_mask(vma);
 
 			gfp = limit_gfp_mask(huge_gfp, gfp);
@@ -2021,7 +2019,7 @@ static struct folio *shmem_swapin_direct
 		return ERR_PTR(-ENOMEM);
 
 	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
-					   gfp, entry)) {
+					   gfp, swap)) {
 		folio_put(new);
 		return ERR_PTR(-ENOMEM);
 	}
@@ -2036,17 +2034,17 @@ static struct folio *shmem_swapin_direct
 	 * In this case, shmem_add_to_page_cache() will help identify the
 	 * concurrent swapin and return -EEXIST.
 	 */
-	if (swapcache_prepare(entry, nr_pages)) {
+	if (swapcache_prepare(swap, nr_pages)) {
 		folio_put(new);
 		return ERR_PTR(-EEXIST);
 	}
 
 	__folio_set_locked(new);
 	__folio_set_swapbacked(new);
-	new->swap = entry;
+	new->swap = swap;
 
-	memcg1_swapin(entry, nr_pages);
-	shadow = get_shadow_from_swap_cache(entry);
+	memcg1_swapin(swap, nr_pages);
+	shadow = get_shadow_from_swap_cache(swap);
 	if (shadow)
 		workingset_refault(new, shadow);
 	folio_add_lru(new);
@@ -2278,20 +2276,21 @@ static int shmem_swapin_folio(struct ino
 	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error, nr_pages, order, swap_order;
+	swp_entry_t swap, index_entry;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
 	bool skip_swapcache = false;
-	swp_entry_t swap;
+	pgoff_t offset;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
-	swap = radix_to_swp_entry(*foliop);
+	index_entry = radix_to_swp_entry(*foliop);
 	*foliop = NULL;
 
-	if (is_poisoned_swp_entry(swap))
+	if (is_poisoned_swp_entry(index_entry))
 		return -EIO;
 
-	si = get_swap_device(swap);
-	order = shmem_confirm_swap(mapping, index, swap);
+	si = get_swap_device(index_entry);
+	order = shmem_confirm_swap(mapping, index, index_entry);
 	if (unlikely(!si)) {
 		if (order < 0)
 			return -EEXIST;
@@ -2303,7 +2302,9 @@ static int shmem_swapin_folio(struct ino
 		return -EEXIST;
 	}
 
-	/* Look it up and read it in.. */
+	/* @index may point to the middle of a large entry, get the real swap value first */
+	offset = index - round_down(index, 1 << order);
+	swap.val = index_entry.val + offset;
 	folio = swap_cache_get_folio(swap, NULL, 0);
 	if (!folio) {
 		/* Or update major stats only when swapin succeeds?? */
@@ -2315,7 +2316,7 @@ static int shmem_swapin_folio(struct ino
 		/* Try direct mTHP swapin bypassing swap cache and readahead */
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			swap_order = order;
-			folio = shmem_swapin_direct(inode, vma, index,
+			folio = shmem_swapin_direct(inode, vma, index, index_entry,
 						    swap, &swap_order, gfp);
 			if (!IS_ERR(folio)) {
 				skip_swapcache = true;
@@ -2338,28 +2339,25 @@ static int shmem_swapin_folio(struct ino
 		}
 	}
 alloced:
+	swap_order = folio_order(folio);
+	nr_pages = folio_nr_pages(folio);
+
+	/* The swap-in should cover both @swap and @index */
+	swap.val = round_down(swap.val, nr_pages);
+	VM_WARN_ON_ONCE(swap.val > index_entry.val + offset);
+	VM_WARN_ON_ONCE(swap.val + nr_pages <= index_entry.val + offset);
+
 	/*
 	 * We need to split an existing large entry if swapin brought in a
 	 * smaller folio due to various of reasons.
-	 *
-	 * And worth noting there is a special case: if there is a smaller
-	 * cached folio that covers @swap, but not @index (it only covers
-	 * first few sub entries of the large entry, but @index points to
-	 * later parts), the swap cache lookup will still see this folio,
-	 * And we need to split the large entry here. Later checks will fail,
-	 * as it can't satisfy the swap requirement, and we will retry
-	 * the swapin from beginning.
 	 */
-	swap_order = folio_order(folio);
+	index = round_down(index, nr_pages);
 	if (order > swap_order) {
-		error = shmem_split_swap_entry(inode, index, swap, gfp);
+		error = shmem_split_swap_entry(inode, index, index_entry, gfp);
 		if (error)
 			goto failed_nolock;
 	}
 
-	index = round_down(index, 1 << swap_order);
-	swap.val = round_down(swap.val, 1 << swap_order);
-
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
 	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
@@ -2372,7 +2370,6 @@ alloced:
 		goto failed;
 	}
 	folio_wait_writeback(folio);
-	nr_pages = folio_nr_pages(folio);
 
 	/*
 	 * Some architectures may have to restore extra metadata to the
_

Patches currently in -mm which might be from kasong@tencent.com are

mm-shmem-swap-fix-softlockup-with-mthp-swapin.patch
mm-shmem-swap-fix-softlockup-with-mthp-swapin-v3.patch
mm-userfaultfd-fix-race-of-userfaultfd_move-and-swap-cache.patch
mm-list_lru-refactor-the-locking-code.patch
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
mm-shmem-swap-avoid-redundant-xarray-lookup-during-swapin.patch
mm-shmem-swap-improve-mthp-swapin-process.patch
mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch


^ permalink raw reply	[flat|nested] 3+ messages in thread

* + mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch added to mm-new branch
@ 2025-06-27 20:16 Andrew Morton
  0 siblings, 0 replies; 3+ messages in thread
From: Andrew Morton @ 2025-06-27 20:16 UTC (permalink / raw)
  To: mm-commits, willy, shikemeng, nphamcs, hughd, dev.jain, chrisl,
	bhe, baolin.wang, baohua, kasong, akpm


The patch titled
     Subject: mm/shmem, swap: avoid false positive swap cache lookup
has been added to the -mm mm-new branch.  Its filename is
     mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch

This patch will later appear in the mm-new branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Note, mm-new is a provisional staging ground for work-in-progress
patches, and acceptance into mm-new is a notification for others to take
notice and to finish up reviews.  Please do not hesitate to respond to
review feedback and post updated versions to replace or incrementally
fixup patches in mm-new.

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Kairui Song <kasong@tencent.com>
Subject: mm/shmem, swap: avoid false positive swap cache lookup
Date: Fri, 27 Jun 2025 14:20:20 +0800

If the shmem read request's index points to the middle of a large swap
entry, shmem swapin does the swap cache lookup using the large swap entry's
starting value (the first sub swap entry of this large entry).  This will
lead to a false positive lookup result if only the first few swap entries
are cached, but the requested swap entry pointed to by index is uncached.

Currently shmem will do a large entry split then retry the swapin from
the beginning, which is a waste of CPU and fragile.  Handle this correctly.

Also add some sanity checks to help understand the code and ensure things
won't go wrong.

Link: https://lkml.kernel.org/r/20250627062020.534-8-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/shmem.c |   60 +++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

--- a/mm/shmem.c~mm-shmem-swap-avoid-false-positive-swap-cache-lookup
+++ a/mm/shmem.c
@@ -1977,14 +1977,19 @@ unlock:
 
 static struct folio *shmem_swapin_direct(struct inode *inode,
 		struct vm_area_struct *vma, pgoff_t index,
-		swp_entry_t entry, int order, gfp_t gfp)
+		swp_entry_t index_entry, swp_entry_t swap,
+		int order, gfp_t gfp)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	int nr_pages = 1 << order;
 	struct folio *new;
-	pgoff_t offset;
+	swp_entry_t entry;
 	gfp_t swap_gfp;
 	void *shadow;
+	int nr_pages;
+
+	/* Prefer aligned THP swapin */
+	entry.val = index_entry.val;
+	nr_pages = 1 << order;
 
 	/*
 	 * We have arrived here because our zones are constrained, so don't
@@ -2011,6 +2016,7 @@ static struct folio *shmem_swapin_direct
 			swap_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
 		}
 	}
+
 retry:
 	new = shmem_alloc_folio(swap_gfp, order, info, index);
 	if (!new) {
@@ -2056,11 +2062,10 @@ fallback:
 	if (!order)
 		return new;
 	/* High order swapin failed, fallback to order 0 and retry */
-	order = 0;
-	nr_pages = 1;
+	entry.val = swap.val;
 	swap_gfp = gfp;
-	offset = index - round_down(index, nr_pages);
-	entry = swp_entry(swp_type(entry), swp_offset(entry) + offset);
+	nr_pages = 1;
+	order = 0;
 	goto retry;
 }
 
@@ -2288,20 +2293,21 @@ static int shmem_swapin_folio(struct ino
 	struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error, nr_pages, order, swap_order;
+	swp_entry_t swap, index_entry;
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
 	bool skip_swapcache = false;
-	swp_entry_t swap;
+	pgoff_t offset;
 
 	VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
-	swap = radix_to_swp_entry(*foliop);
+	index_entry = radix_to_swp_entry(*foliop);
 	*foliop = NULL;
 
-	if (is_poisoned_swp_entry(swap))
+	if (is_poisoned_swp_entry(index_entry))
 		return -EIO;
 
-	si = get_swap_device(swap);
-	order = shmem_confirm_swap(mapping, index, swap);
+	si = get_swap_device(index_entry);
+	order = shmem_confirm_swap(mapping, index, index_entry);
 	if (unlikely(!si)) {
 		if (order < 0)
 			return -EEXIST;
@@ -2313,13 +2319,15 @@ static int shmem_swapin_folio(struct ino
 		return -EEXIST;
 	}
 
-	/* Look it up and read it in.. */
+	/* @index may point to the middle of a large entry, get the real swap value first */
+	offset = index - round_down(index, 1 << order);
+	swap.val = index_entry.val + offset;
 	folio = swap_cache_get_folio(swap, NULL, 0);
 	if (!folio) {
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			/* Direct mTHP swapin without swap cache or readahead */
 			folio = shmem_swapin_direct(inode, vma, index,
-						    swap, order, gfp);
+						    index_entry, swap, order, gfp);
 			if (IS_ERR(folio)) {
 				error = PTR_ERR(folio);
 				folio = NULL;
@@ -2341,28 +2349,25 @@ static int shmem_swapin_folio(struct ino
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
 	}
+
+	swap_order = folio_order(folio);
+	nr_pages = folio_nr_pages(folio);
+	/* The swap-in should cover both @swap and @index */
+	swap.val = round_down(swap.val, nr_pages);
+	VM_WARN_ON_ONCE(swap.val > index_entry.val + offset);
+	VM_WARN_ON_ONCE(swap.val + nr_pages <= index_entry.val + offset);
+
 	/*
 	 * We need to split an existing large entry if swapin brought in a
 	 * smaller folio due to various of reasons.
-	 *
-	 * And worth noting there is a special case: if there is a smaller
-	 * cached folio that covers @swap, but not @index (it only covers
-	 * first few sub entries of the large entry, but @index points to
-	 * later parts), the swap cache lookup will still see this folio,
-	 * And we need to split the large entry here. Later checks will fail,
-	 * as it can't satisfy the swap requirement, and we will retry
-	 * the swapin from beginning.
 	 */
-	swap_order = folio_order(folio);
+	index = round_down(index, nr_pages);
 	if (order > swap_order) {
-		error = shmem_split_swap_entry(inode, index, swap, gfp);
+		error = shmem_split_swap_entry(inode, index, index_entry, gfp);
 		if (error)
 			goto failed_nolock;
 	}
 
-	index = round_down(index, 1 << swap_order);
-	swap.val = round_down(swap.val, 1 << swap_order);
-
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
 	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
@@ -2375,7 +2380,6 @@ static int shmem_swapin_folio(struct ino
 		goto failed;
 	}
 	folio_wait_writeback(folio);
-	nr_pages = folio_nr_pages(folio);
 
 	/*
 	 * Some architectures may have to restore extra metadata to the
_

Patches currently in -mm which might be from kasong@tencent.com are

mm-list_lru-refactor-the-locking-code.patch
mm-shmem-swap-improve-cached-mthp-handling-and-fix-potential-hung.patch
mm-shmem-swap-avoid-redundant-xarray-lookup-during-swapin.patch
mm-shmem-swap-tidy-up-thp-swapin-checks.patch
mm-shmem-swap-clean-up-swap-entry-splitting.patch
mm-shmem-swap-never-use-swap-cache-and-readahead-for-swp_synchronous_io.patch
mm-shmem-swap-fix-major-fault-counting.patch
mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2025-06-27 20:16 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-06-27 20:16 + mm-shmem-swap-avoid-false-positive-swap-cache-lookup.patch added to mm-new branch Andrew Morton
  -- strict thread matches above, loose matches on Subject: below --
2025-06-20  0:40 Andrew Morton
2025-06-17 23:03 Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.