From: Kemeng Shi <shikemeng@huaweicloud.com>
To: hughd@google.com, baolin.wang@linux.alibaba.com,
willy@infradead.org, akpm@linux-foundation.org
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-fsdevel@vger.kernel.org
Subject: [PATCH 3/7] mm: shmem: avoid missing entries in shmem_undo_range() when entries are split concurrently
Date: Fri, 6 Jun 2025 06:10:33 +0800
Message-ID: <20250605221037.7872-4-shikemeng@huaweicloud.com>
In-Reply-To: <20250605221037.7872-1-shikemeng@huaweicloud.com>
If a large swap entry or a large folio entry returned from find_get_entries()
is split before it is truncated, only the first split entry is truncated,
leaving the remaining split entries un-truncated.
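For illustration, a simplified interleaving that loses entries (assuming the
split is driven by a concurrent swapin, as in the commit referenced below; a
concurrent large folio split hits the same window):

  shmem_undo_range()                     concurrent splitter
  ------------------                     -------------------
  find_get_entries()
    returns one large entry covering
    indices [i, i + (1 << order))
                                         split the large entry into
                                         order-0 entries
  truncate/free only the entry now
  at index i
  /* entries at [i + 1, i + (1 << order)) are left behind */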
To address this issue, introduce a new helper, shmem_find_get_entries(),
which is similar to find_get_entries() but also returns the order of each
found entry. A split that happens after the initial lookup can then be
detected by comparing the current entry order with the order returned from
shmem_find_get_entries(), and the lookup is retried when a split is
detected.
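Roughly, the retry logic in shmem_undo_range() becomes the following
(pseudo-code, simplified from the hunks below; locking and error paths
elided):

  nr = shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
                              indices, orders);
  for (i = 0; i < nr; i++) {
          folio = fbatch.folios[i];
          if (xa_is_value(folio)) {
                  /* shmem_free_swap() now rechecks the order under the
                   * xarray lock and returns 0 if it no longer matches */
                  if (!shmem_free_swap(mapping, indices[i], orders[i], folio)) {
                          index = indices[i];    /* split or replaced: retry */
                          break;
                  }
          } else if (folio_order(folio) != orders[i]) {
                  index = indices[i];            /* folio was split: retry */
                  break;
          }
  }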
The race with large swap entries was introduced in 12885cbe88dd ("mm:
shmem: split large entry if the swapin folio is not large"). The race with
large folios appears to be a long-standing issue, possibly related to the
conversion to xarray, the conversion to folios and other changes, so it is
hard to track down the specific commit that directly introduced it.
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
---
mm/filemap.c | 2 +-
mm/internal.h | 2 ++
mm/shmem.c | 81 ++++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..672844b94d3a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2015,7 +2015,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
}
EXPORT_SYMBOL(__filemap_get_folio);
-static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
{
struct folio *folio;
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..9573b3a9e8c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -446,6 +446,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+ xa_mark_t mark);
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
diff --git a/mm/shmem.c b/mm/shmem.c
index f1062910a4de..2349673b239b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -949,18 +949,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
* the number of pages being freed. 0 means entry not found in XArray (0 pages
* being freed).
*/
-static long shmem_free_swap(struct address_space *mapping,
- pgoff_t index, void *radswap)
+static long shmem_free_swap(struct address_space *mapping, pgoff_t index,
+ int order, void *radswap)
{
- int order = xa_get_order(&mapping->i_pages, index);
+ int old_order;
void *old;
- old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
- if (old != radswap)
+ xa_lock_irq(&mapping->i_pages);
+ old_order = xa_get_order(&mapping->i_pages, index);
+ /* free swap anyway if input order is -1 */
+ if (order != -1 && old_order != order) {
+ xa_unlock_irq(&mapping->i_pages);
+ return 0;
+ }
+
+ old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
+ if (old != radswap) {
+ xa_unlock_irq(&mapping->i_pages);
return 0;
- free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+ }
+ xa_unlock_irq(&mapping->i_pages);
- return 1 << order;
+ free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << old_order);
+ return 1 << old_order;
}
/*
@@ -1077,6 +1088,39 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
return folio;
}
+/*
+ * Similar to find_get_entries(), but also returns the order of found entries
+ */
+static unsigned shmem_find_get_entries(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch,
+ pgoff_t *indices, int *orders)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ indices[fbatch->nr] = xas.xa_index;
+ if (!xa_is_value(folio))
+ orders[fbatch->nr] = folio_order(folio);
+ else
+ orders[fbatch->nr] = xas_get_order(&xas);
+ if (!folio_batch_add(fbatch, folio))
+ break;
+ }
+
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ nr = 1 << orders[idx];
+ *start = round_down(indices[idx] + nr, nr);
+ }
+ rcu_read_unlock();
+
+ return folio_batch_count(fbatch);
+}
+
/*
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -1090,6 +1134,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ int orders[PAGEVEC_SIZE];
struct folio *folio;
bool same_folio;
long nr_swaps_freed = 0;
@@ -1113,7 +1158,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (unfalloc)
continue;
nr_swaps_freed += shmem_free_swap(mapping,
- indices[i], folio);
+ indices[i], -1, folio);
continue;
}
@@ -1166,8 +1211,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, &index, end - 1, &fbatch,
- indices)) {
+ if (!shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
+ indices, orders)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
break;
@@ -1183,9 +1228,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (unfalloc)
continue;
- swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+ swaps_freed = shmem_free_swap(mapping,
+ indices[i], orders[i], folio);
+ /*
+ * Swap was replaced by page or was
+ * split: retry
+ */
if (!swaps_freed) {
- /* Swap was replaced by page: retry */
index = indices[i];
break;
}
@@ -1196,8 +1245,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_lock(folio);
if (!unfalloc || !folio_test_uptodate(folio)) {
- if (folio_mapping(folio) != mapping) {
- /* Page was replaced by swap: retry */
+ /*
+ * Page was replaced by swap or was
+ * split: retry
+ */
+ if (folio_mapping(folio) != mapping ||
+ folio_order(folio) != orders[i]) {
folio_unlock(folio);
index = indices[i];
break;
--
2.30.0