[PATCH 14/28] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
	Matthew Wilcox <willy@infradead.org>,
	Hugh Dickins <hughd@google.com>, Chris Li <chrisl@kernel.org>,
	David Hildenbrand <david@redhat.com>,
	Yosry Ahmed <yosryahmed@google.com>,
	"Huang, Ying" <ying.huang@linux.alibaba.com>,
	Nhat Pham <nphamcs@gmail.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Baolin Wang <baolin.wang@linux.alibaba.com>,
	Baoquan He <bhe@redhat.com>, Barry Song <baohua@kernel.org>,
	Kalesh Singh <kaleshsingh@google.com>,
	Kemeng Shi <shikemeng@huaweicloud.com>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	Ryan Roberts <ryan.roberts@arm.com>,
	linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [PATCH 14/28] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO
Date: Thu, 15 May 2025 04:17:14 +0800	[thread overview]
Message-ID: <20250514201729.48420-15-ryncsn@gmail.com> (raw)
In-Reply-To: <20250514201729.48420-1-ryncsn@gmail.com>

From: Kairui Song <kasong@tencent.com>

Now the overhead of the swap cache is trivial to none, bypassing the
swap cache is no longer a valid optimization.

So remove the cache bypass swap path for simplification. Many helpers
and functions can be dropped now.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/shmem.c    | 109 ++++++++++++++++++--------------------------------
 mm/swap.h     |   4 --
 mm/swapfile.c |  35 +++++-----------
 3 files changed, 48 insertions(+), 100 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index da80a8faa39e..e87eff03c08b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -899,7 +899,9 @@ static int shmem_add_to_page_cache(struct folio *folio,
 				   pgoff_t index, void *expected, gfp_t gfp)
 {
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
-	long nr = folio_nr_pages(folio);
+	unsigned long nr = folio_nr_pages(folio);
+	swp_entry_t iter, swap;
+	void *entry;
 
 	VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -912,13 +914,19 @@ static int shmem_add_to_page_cache(struct folio *folio,
 	gfp &= GFP_RECLAIM_MASK;
 	folio_throttle_swaprate(folio, gfp);
 
+	if (expected)
+		swap = iter = radix_to_swp_entry(expected);
+
 	do {
 		xas_lock_irq(&xas);
-		if (expected != xas_find_conflict(&xas)) {
-			xas_set_err(&xas, -EEXIST);
-			goto unlock;
+		xas_for_each_conflict(&xas, entry) {
+			if (!expected || entry != swp_to_radix_entry(iter)) {
+				xas_set_err(&xas, -EEXIST);
+				goto unlock;
+			}
+			iter.val += 1 << xas_get_order(&xas);
 		}
-		if (expected && xas_find_conflict(&xas)) {
+		if (expected && iter.val - nr != swap.val) {
 			xas_set_err(&xas, -EEXIST);
 			goto unlock;
 		}
@@ -1973,14 +1981,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
 	return ERR_PTR(error);
 }
 
-static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+static struct folio *shmem_swapin_folio_order(struct inode *inode,
 		struct vm_area_struct *vma, pgoff_t index,
 		swp_entry_t entry, int order, gfp_t gfp)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct folio *new;
-	void *shadow;
-	int nr_pages;
+	struct folio *new, *swapcache;
 
 	/*
 	 * We have arrived here because our zones are constrained, so don't
@@ -1995,41 +2001,19 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
 
 	new = shmem_alloc_folio(gfp, order, info, index);
 	if (!new)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
-	nr_pages = folio_nr_pages(new);
 	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
-					   gfp, entry)) {
+				gfp, entry)) {
 		folio_put(new);
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	}
 
-	/*
-	 * Prevent parallel swapin from proceeding with the swap cache flag.
-	 *
-	 * Of course there is another possible concurrent scenario as well,
-	 * that is to say, the swap cache flag of a large folio has already
-	 * been set by swapcache_prepare(), while another thread may have
-	 * already split the large swap entry stored in the shmem mapping.
-	 * In this case, shmem_add_to_page_cache() will help identify the
-	 * concurrent swapin and return -EEXIST.
-	 */
-	if (swapcache_prepare(entry, nr_pages)) {
+	swapcache = swapin_entry(entry, new);
+	if (swapcache != new)
 		folio_put(new);
-		return ERR_PTR(-EEXIST);
-	}
 
-	__folio_set_locked(new);
-	__folio_set_swapbacked(new);
-	new->swap = entry;
-
-	memcg1_swapin(entry, nr_pages);
-	shadow = swap_cache_get_shadow(entry);
-	if (shadow)
-		workingset_refault(new, shadow);
-	folio_add_lru(new);
-	swap_read_folio(new, NULL);
-	return new;
+	return swapcache;
 }
 
 /*
@@ -2122,8 +2106,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 }
 
 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
-					 struct folio *folio, swp_entry_t swap,
-					 bool skip_swapcache)
+					 struct folio *folio, swp_entry_t swap)
 {
 	struct address_space *mapping = inode->i_mapping;
 	swp_entry_t swapin_error;
@@ -2139,8 +2122,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
 
 	nr_pages = folio_nr_pages(folio);
 	folio_wait_writeback(folio);
-	if (!skip_swapcache)
-		delete_from_swap_cache(folio);
+	delete_from_swap_cache(folio);
 	/*
 	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
 	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2241,7 +2223,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
-	bool skip_swapcache = false;
 	swp_entry_t swap;
 	int error, nr_pages, order, split_order;
 
@@ -2283,25 +2264,16 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 				  !zswap_never_enabled()))
 			fallback_order0 = true;
 
-		/* Skip swapcache for synchronous device. */
+		/* Try mTHP swapin for synchronous device. */
 		if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
-			folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
-			if (!IS_ERR(folio)) {
-				skip_swapcache = true;
+			folio = shmem_swapin_folio_order(inode, vma, index, swap, order, gfp);
+			if (folio)
 				goto alloced;
-			}
-
-			/*
-			 * Fallback to swapin order-0 folio unless the swap entry
-			 * already exists.
-			 */
-			error = PTR_ERR(folio);
-			folio = NULL;
-			if (error == -EEXIST)
-				goto failed;
 		}
 
 		/*
+		 * Fallback to swapin order-0 folio.
+		 *
 		 * Now swap device can only swap in order 0 folio, then we
 		 * should split the large swap entry stored in the pagecache
 		 * if necessary.
@@ -2338,13 +2310,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		split_order = shmem_split_large_entry(inode, index, swap, gfp);
 		if (split_order < 0) {
 			error = split_order;
+			folio_put(folio);
+			folio = NULL;
 			goto failed;
 		}
 	}
 alloced:
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
-	if (!skip_swapcache && !folio_swap_contains(folio, swap)) {
+	if (!folio_swap_contains(folio, swap)) {
 		error = -EEXIST;
 		goto unlock;
 	}
@@ -2353,12 +2327,15 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	index = round_down(index, nr_pages);
 	swap = swp_entry(swp_type(swap), round_down(swp_offset(swap), nr_pages));
 
-	if (folio_order(folio) != shmem_check_swap_entry(mapping, index, swap)) {
+	/*
+	 * Swap must go through swap cache layer, only the split may happen
+	 * without locking the swap cache.
+	 */
+	if (folio_order(folio) < shmem_check_swap_entry(mapping, index, swap)) {
 		error = -EEXIST;
 		goto unlock;
 	}
-	if (!skip_swapcache)
-		swap_update_readahead(folio, NULL, 0);
+	swap_update_readahead(folio, NULL, 0);
 	if (!folio_test_uptodate(folio)) {
 		error = -EIO;
 		goto failed;
@@ -2387,12 +2364,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
 
-	if (skip_swapcache) {
-		folio->swap.val = 0;
-		swapcache_clear(si, swap, nr_pages);
-	} else {
-		delete_from_swap_cache(folio);
-	}
+	delete_from_swap_cache(folio);
 	folio_mark_dirty(folio);
 	swap_free_nr(swap, nr_pages);
 	put_swap_device(si);
@@ -2403,11 +2375,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (shmem_check_swap_entry(mapping, index, swap) < 0)
 		error = -EEXIST;
 	if (error == -EIO)
-		shmem_set_folio_swapin_error(inode, index, folio, swap,
-					     skip_swapcache);
+		shmem_set_folio_swapin_error(inode, index, folio, swap);
 unlock:
-	if (skip_swapcache)
-		swapcache_clear(si, swap, folio_nr_pages(folio));
 	if (folio) {
 		folio_unlock(folio);
 		folio_put(folio);
diff --git a/mm/swap.h b/mm/swap.h
index aab6bf9c3a8a..cad24a3abda8 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -319,10 +319,6 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 	return 0;
 }
 
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
-}
-
 static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
 {
 	return NULL;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 62af67b6f7c2..d3abd2149f8e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1430,22 +1430,6 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	return NULL;
 }
 
-static void swap_entries_put_cache(struct swap_info_struct *si,
-				   swp_entry_t entry, int nr)
-{
-	unsigned long offset = swp_offset(entry);
-	struct swap_cluster_info *ci;
-
-	ci = swap_lock_cluster(si, offset);
-	if (swap_only_has_cache(si, offset, nr)) {
-		swap_entries_free(si, ci, entry, nr);
-	} else {
-		for (int i = 0; i < nr; i++, entry.val++)
-			swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
-	}
-	swap_unlock_cluster(ci);
-}
-
 static bool swap_entries_put_map(struct swap_info_struct *si,
 				 swp_entry_t entry, int nr)
 {
@@ -1578,13 +1562,21 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
 void put_swap_folio(struct folio *folio, swp_entry_t entry)
 {
 	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
 	int size = 1 << swap_entry_order(folio_order(folio));
 
 	si = _swap_info_get(entry);
 	if (!si)
 		return;
 
-	swap_entries_put_cache(si, entry, size);
+	ci = swap_lock_cluster(si, offset);
+	if (swap_only_has_cache(si, offset, size))
+		swap_entries_free(si, ci, entry, size);
+	else
+		for (int i = 0; i < size; i++, entry.val++)
+			swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+	swap_unlock_cluster(ci);
 }
 
 int __swap_count(swp_entry_t entry)
@@ -3615,15 +3607,6 @@ int swapcache_prepare(swp_entry_t entry, int nr)
 	return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
 }
 
-/*
- * Caller should ensure entries belong to the same folio so
- * the entries won't span cross cluster boundary.
- */
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
-{
-	swap_entries_put_cache(si, entry, nr);
-}
-
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
  * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
-- 
2.49.0

next prev parent reply	other threads:[~2025-05-14 20:18 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-14 20:17 [PATCH 00/28] mm, swap: introduce swap table Kairui Song
2025-05-14 20:17 ` [PATCH 01/28] mm, swap: don't scan every fragment cluster Kairui Song
2025-05-14 20:17 ` [PATCH 02/28] mm, swap: consolidate the helper for mincore Kairui Song
2025-05-14 20:17 ` [PATCH 03/28] mm/shmem, swap: remove SWAP_MAP_SHMEM Kairui Song
2025-05-14 20:17 ` [PATCH 04/28] mm, swap: split readahead update out of swap cache lookup Kairui Song
2025-05-14 20:17 ` [PATCH 05/28] mm, swap: sanitize swap cache lookup convention Kairui Song
2025-05-19  4:38   ` Barry Song
2025-05-20  3:31     ` Kairui Song
2025-05-20  4:41       ` Barry Song
2025-05-20 19:09         ` Kairui Song
2025-05-20 22:33           ` Barry Song
2025-05-21  2:45             ` Kairui Song
2025-05-21  3:24               ` Barry Song
2025-05-23  2:29               ` Barry Song
2025-05-23 20:01                 ` Kairui Song
2025-05-27  7:58                   ` Barry Song
2025-05-27 15:11                     ` Kairui Song
2025-05-30  8:49                       ` Kairui Song
2025-05-30 19:24                         ` Kairui Song
2025-05-14 20:17 ` [PATCH 06/28] mm, swap: rearrange swap cluster definition and helpers Kairui Song
2025-05-19  6:26   ` Barry Song
2025-05-20  3:50     ` Kairui Song
2025-05-14 20:17 ` [PATCH 07/28] mm, swap: tidy up swap device and cluster info helpers Kairui Song
2025-05-14 20:17 ` [PATCH 08/28] mm, swap: use swap table for the swap cache and switch API Kairui Song
2025-05-14 20:17 ` [PATCH 09/28] mm/swap: rename __read_swap_cache_async to __swapin_cache_alloc Kairui Song
2025-05-14 20:17 ` [PATCH 10/28] mm, swap: add a swap helper for bypassing only read ahead Kairui Song
2025-05-14 20:17 ` [PATCH 11/28] mm, swap: clean up and consolidate helper for mTHP swapin check Kairui Song
2025-05-15  9:31   ` Klara Modin
2025-05-15  9:39     ` Kairui Song
2025-05-19  7:08   ` Barry Song
2025-05-19 11:09     ` Kairui Song
2025-05-19 11:57       ` Barry Song
2025-05-14 20:17 ` [PATCH 12/28] mm, swap: never bypass the swap cache for SWP_SYNCHRONOUS_IO Kairui Song
2025-05-14 20:17 ` [PATCH 13/28] mm/shmem, swap: avoid redundant Xarray lookup during swapin Kairui Song
2025-05-14 20:17 ` Kairui Song [this message]
2025-05-14 20:17 ` [PATCH 15/28] mm, swap: split locked entry freeing into a standalone helper Kairui Song
2025-05-14 20:17 ` [PATCH 16/28] mm, swap: use swap cache as the swap in synchronize layer Kairui Song
2025-05-14 20:17 ` [PATCH 17/28] mm, swap: sanitize swap entry management workflow Kairui Song
2025-05-14 20:17 ` [PATCH 18/28] mm, swap: rename and introduce folio_free_swap_cache Kairui Song
2025-05-14 20:17 ` [PATCH 19/28] mm, swap: clean up and improve swap entries batch freeing Kairui Song
2025-05-14 20:17 ` [PATCH 20/28] mm, swap: check swap table directly for checking cache Kairui Song
2025-06-19 10:38   ` Baoquan He
2025-06-19 10:50     ` Kairui Song
2025-06-20  8:04       ` Baoquan He
2025-05-14 20:17 ` [PATCH 21/28] mm, swap: add folio to swap cache directly on allocation Kairui Song
2025-05-14 20:17 ` [PATCH 22/28] mm, swap: drop the SWAP_HAS_CACHE flag Kairui Song
2025-05-14 20:17 ` [PATCH 23/28] mm, swap: remove no longer needed _swap_info_get Kairui Song
2025-05-14 20:17 ` [PATCH 24/28] mm, swap: implement helpers for reserving data in swap table Kairui Song
2025-05-15  9:40   ` Klara Modin
2025-05-16  2:35     ` Kairui Song
2025-05-14 20:17 ` [PATCH 25/28] mm/workingset: leave highest 8 bits empty for anon shadow Kairui Song
2025-05-14 20:17 ` [PATCH 26/28] mm, swap: minor clean up for swapon Kairui Song
2025-05-14 20:17 ` [PATCH 27/28] mm, swap: use swap table to track swap count Kairui Song
2025-05-14 20:17 ` [PATCH 28/28] mm, swap: implement dynamic allocation of swap table Kairui Song
2025-05-21 18:36   ` Nhat Pham
2025-05-22  4:13     ` Kairui Song

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:da80a8faa39 dfblob:e87eff03c08 dfblob:aab6bf9c3a8
dfblob:cad24a3abda dfblob:62af67b6f7c dfblob:d3abd2149f8 )
 OR (
bs:"[PATCH 14/28] mm/shmem: never bypass the swap cache for SWP_SYNCHRONOUS_IO" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250514201729.48420-15-ryncsn@gmail.com \
    --to=ryncsn@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=baohua@kernel.org \
    --cc=baolin.wang@linux.alibaba.com \
    --cc=bhe@redhat.com \
    --cc=chrisl@kernel.org \
    --cc=david@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kaleshsingh@google.com \
    --cc=kasong@tencent.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=nphamcs@gmail.com \
    --cc=ryan.roberts@arm.com \
    --cc=shikemeng@huaweicloud.com \
    --cc=tim.c.chen@linux.intel.com \
    --cc=willy@infradead.org \
    --cc=ying.huang@linux.alibaba.com \
    --cc=yosryahmed@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).