From: fujunjie <fujunjie1@qq.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Chris Li <chrisl@kernel.org>, Kairui Song <kasong@tencent.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Nhat Pham <nphamcs@gmail.com>, Yosry Ahmed <yosry@kernel.org>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, Jonathan Corbet <corbet@lwn.net>,
	David Hildenbrand <david@kernel.org>,
	Ryan Roberts <ryan.roberts@arm.com>,
	Barry Song <baohua@kernel.org>,
	Baolin Wang <baolin.wang@linux.alibaba.com>,
	Chengming Zhou <chengming.zhou@linux.dev>,
	Baoquan He <bhe@redhat.com>, Lorenzo Stoakes <ljs@kernel.org>
Subject: [RFC PATCH 5/5] mm: swap: allow zswap-backed large folio swapin
Date: Fri,  8 May 2026 20:20:33 +0000
Message-ID: <tencent_0CCC08D68D8D053417D633B1A2CA3C878706@qq.com>
In-Reply-To: <tencent_8B437BE4F586C162950BF71954316C1EDB05@qq.com>

alloc_swap_folio() has been falling back to order-0 in the anonymous
synchronous swapin path whenever zswap has ever been enabled, because a
large folio's swap range could contain a mixture of zswap and non-zswap
entries and zswap_load() could not handle large folios.

zswap_load() can now load a range that is fully present in zswap, and
zswap_entry_batch() can identify mixed zswap ranges. Use that check
alongside the existing zeromap and swapcache checks when selecting a large
folio for anonymous swapin, and recheck the range under the swap cluster
lock just before inserting a large folio into the swap cache, since the
first check runs unlocked and the range's zswap state can change before
the insertion.

With mixed zswap ranges rejected and the insertion-race fallback in place,
remove the blanket zswap_never_enabled() fallback from the anonymous swapin
path so that all-zswap and all-disk anonymous ranges can use mTHP swapin.
Shmem keeps its existing zswap fallback and is out of scope for this RFC.
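
A rough sketch of the resulting fault-path fallback flow; the
order0_fallback label is hypothetical, and the NULL-on-recheck-failure
behaviour is taken from the swapin_folio() comment updated below:

	folio = alloc_swap_folio(vmf);		/* may be order > 0 */
	swapcache = swapin_folio(entry, folio);	/* rechecks under cluster lock */
	if (!swapcache && folio_test_large(folio)) {
		/*
		 * Raced with another swapin, or the range is no longer
		 * uniformly zswap-backed: retry with an order-0 folio.
		 */
		folio_put(folio);
		goto order0_fallback;	/* hypothetical label */
	}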

Signed-off-by: fujunjie <fujunjie1@qq.com>
---
 mm/memory.c     | 21 ++++++---------------
 mm/swap_state.c | 23 +++++++++++++++--------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 84e3b77b8293..0be249108de1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,6 +78,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/pgalloc.h>
 #include <linux/uaccess.h>
+#include <linux/zswap.h>
 
 #include <trace/events/kmem.h>
 
@@ -4635,13 +4636,11 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
 	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
 		return false;
 
-	/*
-	 * swap_read_folio() can't handle the case a large folio is hybridly
-	 * from different backends. And they are likely corner cases. Similar
-	 * things might be added once zswap support large folios.
-	 */
+	/* swap_read_folio() can't handle hybrid backend large folios. */
 	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
 		return false;
+	if (unlikely(zswap_entry_batch(entry, nr_pages, NULL) != nr_pages))
+		return false;
 	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
 		return false;
 
@@ -4690,14 +4689,6 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	if (unlikely(userfaultfd_armed(vma)))
 		goto fallback;
 
-	/*
-	 * A large swapped out folio could be partially or fully in zswap. We
-	 * lack handling for such cases, so fallback to swapping in order-0
-	 * folio.
-	 */
-	if (!zswap_never_enabled())
-		goto fallback;
-
 	entry = softleaf_from_pte(vmf->orig_pte);
 	/*
 	 * Get a list of all the (large) orders below PMD_ORDER that are enabled
@@ -4772,8 +4763,8 @@ static struct folio *swapin_synchronous_folio(swp_entry_t entry,
 	order = folio_order(folio);
 
 	/*
-	 * folio is charged, so swapin can only fail due to raced swapin and
-	 * return NULL.
+	 * folio is charged, so NULL means the large folio could not be
+	 * inserted and needs order-0 fallback.
 	 */
 	swapcache = swapin_folio(entry, folio);
 	if (swapcache == folio)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..4e58fad5e5f0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/huge_mm.h>
 #include <linux/shmem_fs.h>
+#include <linux/zswap.h>
 #include "internal.h"
 #include "swap_table.h"
 #include "swap.h"
@@ -207,6 +208,11 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry,
 		if (swp_tb_is_shadow(old_tb))
 			shadow = swp_tb_to_shadow(old_tb);
 	} while (++ci_off < ci_end);
+	if (unlikely(folio_test_large(folio) &&
+		     zswap_entry_batch(entry, nr_pages, NULL) != nr_pages)) {
+		err = -EAGAIN;
+		goto failed;
+	}
 	__swap_cache_add_folio(ci, folio, entry);
 	swap_cluster_unlock(ci);
 	if (shadowp)
@@ -460,7 +466,8 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
  *
  * Context: Caller must protect the swap device with reference count or locks.
  * Return: Returns the folio being added on success. Returns the existing folio
- * if @entry is already cached. Returns NULL if raced with swapin or swapoff.
+ * if @entry is already cached. Returns NULL if raced with swapin or swapoff,
+ * or if a large folio fails a backend recheck before insertion.
  */
 static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 						  struct folio *folio,
@@ -483,10 +490,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry,
 
 		/*
 		 * Large order allocation needs special handling on
-		 * race: if a smaller folio exists in cache, swapin needs
-		 * to fallback to order 0, and doing a swap cache lookup
-		 * might return a folio that is irrelevant to the faulting
-		 * entry because @entry is aligned down. Just return NULL.
+		 * race or backend recheck failure: swapin needs to fall back
+		 * to order 0, and doing a swap cache lookup might return a
+		 * folio that is irrelevant to the faulting entry because
+		 * @entry is aligned down. Just return NULL.
 		 */
 		if (ret != -EEXIST || folio_test_large(folio))
 			goto failed;
@@ -567,9 +574,9 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
  * with the folio size.
  *
  * Return: returns pointer to @folio on success. If folio is a large folio
- * and this raced with another swapin, NULL will be returned to allow fallback
- * to order 0. Else, if another folio was already added to the swap cache,
- * return that swap cache folio instead.
+ * and it raced with another swapin or failed a backend recheck, NULL will be
+ * returned to allow fallback to order 0. Else, if another folio was already
+ * added to the swap cache, return that swap cache folio instead.
  */
 struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
 {
-- 
2.34.1

