From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: Andrew Morton <akpm@linux-foundation.org>,
Hugh Dickins <hughd@google.com>,
Baolin Wang <baolin.wang@linux.alibaba.com>,
Matthew Wilcox <willy@infradead.org>,
Kemeng Shi <shikemeng@huaweicloud.com>,
Chris Li <chrisl@kernel.org>, Nhat Pham <nphamcs@gmail.com>,
Baoquan He <bhe@redhat.com>, Barry Song <baohua@kernel.org>,
linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [PATCH v3 5/7] mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO
Date: Fri, 27 Jun 2025 14:20:18 +0800
Message-ID: <20250627062020.534-6-ryncsn@gmail.com>
In-Reply-To: <20250627062020.534-1-ryncsn@gmail.com>
From: Kairui Song <kasong@tencent.com>
Currently, if a THP swapin fails for reasons such as a partially
conflicting swap cache or ZSWAP being enabled, it falls back to cached
swapin.
Right now the swap cache has a non-trivial overhead, and readahead is
not helpful for SWP_SYNCHRONOUS_IO devices, so we should always skip
the readahead and swap cache, even if the swapin falls back to order 0.
So handle the fallback logic directly, without falling back to the
cached read.
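For reference, the resulting control flow of shmem_swapin_direct() is
roughly the following (a simplified sketch only; alloc_and_charge() is
a hypothetical stand-in for the folio allocation plus memcg charge
steps, not a real kernel API):

	retry:
		new = alloc_and_charge(swap_gfp, order);
		if (IS_ERR(new)) {
			/* Order 0 failed: nothing to fall back to, abort */
			if (!order)
				return new;
			/* Pick the order-0 entry covering the faulting index */
			entry.val += index - round_down(index, 1 << order);
			order = 0;
			swap_gfp = gfp;	/* drop the THP-specific gfp limits */
			goto retry;
		}
		/* Synchronous read, no swap cache or readahead involved */
		swap_read_folio(new, NULL);
		return new;

Note the order-0 entry is computed from the original order, before
order is reset for the retry.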
As a side effect, also slightly tweak the behavior when the WARN_ON is
triggered (the shmem mapping is corrupted, or the code is buggy): just
return -EINVAL. This should be OK, as things are already wrong beyond
recovery at that point.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/shmem.c | 68 ++++++++++++++++++++++++++++++------------------------
1 file changed, 38 insertions(+), 30 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5be9c905396e..5f2641fd1be7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1975,13 +1975,15 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
return ERR_PTR(error);
}
-static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+static struct folio *shmem_swapin_direct(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
swp_entry_t entry, int order, gfp_t gfp)
{
struct shmem_inode_info *info = SHMEM_I(inode);
int nr_pages = 1 << order;
struct folio *new;
+ pgoff_t offset;
+ gfp_t swap_gfp;
void *shadow;
/*
@@ -1989,6 +1991,7 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
* limit chance of success with further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
+ swap_gfp = gfp;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (WARN_ON_ONCE(order))
return ERR_PTR(-EINVAL);
@@ -2003,20 +2006,23 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
if ((vma && unlikely(userfaultfd_armed(vma))) ||
!zswap_never_enabled() ||
non_swapcache_batch(entry, nr_pages) != nr_pages) {
- return ERR_PTR(-EINVAL);
+ goto fallback;
} else {
- gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ swap_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
}
}
-
- new = shmem_alloc_folio(gfp, order, info, index);
- if (!new)
- return ERR_PTR(-ENOMEM);
+retry:
+ new = shmem_alloc_folio(swap_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
+ }
if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
- gfp, entry)) {
+ swap_gfp, entry)) {
folio_put(new);
- return ERR_PTR(-ENOMEM);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
}
/*
@@ -2045,6 +2051,17 @@ static struct folio *shmem_swap_alloc_folio(struct inode *inode,
folio_add_lru(new);
swap_read_folio(new, NULL);
return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fall back to, abort */
+ if (!order)
+ return new;
+ /* High order swapin failed, fall back to order 0 and retry */
+ offset = index - round_down(index, nr_pages);
+ entry = swp_entry(swp_type(entry), swp_offset(entry) + offset);
+ order = 0;
+ nr_pages = 1;
+ swap_gfp = gfp;
+ goto retry;
}
/*
@@ -2243,7 +2260,6 @@ static int shmem_split_swap_entry(struct inode *inode, pgoff_t index,
cur_order = split_order;
split_order = xas_try_split_min_order(split_order);
}
-
unlock:
xas_unlock_irq(&xas);
@@ -2306,34 +2322,26 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
-
- /* Skip swapcache for synchronous device. */
if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
- folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
- if (!IS_ERR(folio)) {
+ /* Direct mTHP swapin without swap cache or readahead */
+ folio = shmem_swapin_direct(inode, vma, index,
+ swap, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ } else {
skip_swapcache = true;
- goto alloced;
}
-
+ } else {
/*
- * Fallback to swapin order-0 folio unless the swap entry
- * already exists.
+ * Order 0 swapin using the swap cache and readahead; it
+ * may return an order > 0 folio due to a raced swap cache.
*/
- error = PTR_ERR(folio);
- folio = NULL;
- if (error == -EEXIST)
- goto failed;
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
}
-
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
+ if (!folio)
goto failed;
- }
}
-
-alloced:
/*
* We need to split an existing large entry if swapin brought in a
* smaller folio due to various reasons.
--
2.50.0