From: Kemeng Shi <shikemeng@huaweicloud.com>
To: hughd@google.com, baolin.wang@linux.alibaba.com,
willy@infradead.org, akpm@linux-foundation.org
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-fsdevel@vger.kernel.org
Subject: [PATCH 3/7] mm: shmem: avoid missing entries in shmem_undo_range() when entries are split concurrently
Date: Fri, 6 Jun 2025 06:10:33 +0800
Message-ID: <20250605221037.7872-4-shikemeng@huaweicloud.com>
In-Reply-To: <20250605221037.7872-1-shikemeng@huaweicloud.com>
If a large swap entry or a large folio entry returned from find_get_entries()
is split before it is truncated, only the first split entry is truncated,
leaving the remaining split entries un-truncated.
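For illustration, a simplified interleaving that loses entries (assuming the
split is driven by a concurrent swapin, as in the commit referenced below; a
concurrent large folio split hits the same window):

  shmem_undo_range()                     concurrent splitter
  ------------------                     -------------------
  find_get_entries()
    returns one large entry covering
    indices [i, i + (1 << order))
                                         split the large entry into
                                         order-0 entries
  truncate/free only the entry now
  at index i
  /* entries at [i + 1, i + (1 << order)) are left behind */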
To address this issue, introduce a new helper, shmem_find_get_entries(),
which is similar to find_get_entries() but also returns the order of each
found entry. A split that happens after the initial lookup can then be
detected by comparing the current entry order with the order returned from
shmem_find_get_entries(), and the lookup is retried when a split is
detected.
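Roughly, the retry logic in shmem_undo_range() becomes the following
(pseudo-code, simplified from the hunks below; locking and error paths
elided):

  nr = shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
                              indices, orders);
  for (i = 0; i < nr; i++) {
          folio = fbatch.folios[i];
          if (xa_is_value(folio)) {
                  /* shmem_free_swap() now rechecks the order under the
                   * xarray lock and returns 0 if it no longer matches */
                  if (!shmem_free_swap(mapping, indices[i], orders[i], folio)) {
                          index = indices[i];    /* split or replaced: retry */
                          break;
                  }
          } else if (folio_order(folio) != orders[i]) {
                  index = indices[i];            /* folio was split: retry */
                  break;
          }
  }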
The race with large swap entries was introduced in 12885cbe88dd ("mm:
shmem: split large entry if the swapin folio is not large"). The race with
large folios appears to be a long-standing issue, possibly related to the
conversion to xarray, the conversion to folios and other changes, so it is
hard to track down the specific commit that directly introduced it.
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
---
mm/filemap.c | 2 +-
mm/internal.h | 2 ++
mm/shmem.c | 81 ++++++++++++++++++++++++++++++++++++++++++---------
3 files changed, 70 insertions(+), 15 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7b90cbeb4a1a..672844b94d3a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2015,7 +2015,7 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
}
EXPORT_SYMBOL(__filemap_get_folio);
-static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
{
struct folio *folio;
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..9573b3a9e8c0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -446,6 +446,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, nr_to_read);
}
+struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
+ xa_mark_t mark);
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
diff --git a/mm/shmem.c b/mm/shmem.c
index f1062910a4de..2349673b239b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -949,18 +949,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
* the number of pages being freed. 0 means entry not found in XArray (0 pages
* being freed).
*/
-static long shmem_free_swap(struct address_space *mapping,
- pgoff_t index, void *radswap)
+static long shmem_free_swap(struct address_space *mapping, pgoff_t index,
+ int order, void *radswap)
{
- int order = xa_get_order(&mapping->i_pages, index);
+ int old_order;
void *old;
- old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
- if (old != radswap)
+ xa_lock_irq(&mapping->i_pages);
+ old_order = xa_get_order(&mapping->i_pages, index);
+ /* free swap anyway if input order is -1 */
+ if (order != -1 && old_order != order) {
+ xa_unlock_irq(&mapping->i_pages);
+ return 0;
+ }
+
+ old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0);
+ if (old != radswap) {
+ xa_unlock_irq(&mapping->i_pages);
return 0;
- free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+ }
+ xa_unlock_irq(&mapping->i_pages);
- return 1 << order;
+ free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << old_order);
+ return 1 << old_order;
}
/*
@@ -1077,6 +1088,39 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
return folio;
}
+/*
+ * Similar to find_get_entries(), but also returns the order of found entries
+ */
+static unsigned shmem_find_get_entries(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch,
+ pgoff_t *indices, int *orders)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ indices[fbatch->nr] = xas.xa_index;
+ if (!xa_is_value(folio))
+ orders[fbatch->nr] = folio_order(folio);
+ else
+ orders[fbatch->nr] = xas_get_order(&xas);
+ if (!folio_batch_add(fbatch, folio))
+ break;
+ }
+
+ if (folio_batch_count(fbatch)) {
+ unsigned long nr;
+ int idx = folio_batch_count(fbatch) - 1;
+
+ nr = 1 << orders[idx];
+ *start = round_down(indices[idx] + nr, nr);
+ }
+ rcu_read_unlock();
+
+ return folio_batch_count(fbatch);
+}
+
/*
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -1090,6 +1134,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ int orders[PAGEVEC_SIZE];
struct folio *folio;
bool same_folio;
long nr_swaps_freed = 0;
@@ -1113,7 +1158,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (unfalloc)
continue;
nr_swaps_freed += shmem_free_swap(mapping,
- indices[i], folio);
+ indices[i], -1, folio);
continue;
}
@@ -1166,8 +1211,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, &index, end - 1, &fbatch,
- indices)) {
+ if (!shmem_find_get_entries(mapping, &index, end - 1, &fbatch,
+ indices, orders)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
break;
@@ -1183,9 +1228,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (unfalloc)
continue;
- swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+ swaps_freed = shmem_free_swap(mapping,
+ indices[i], orders[i], folio);
+ /*
+ * Swap was replaced by page or was
+ * split: retry
+ */
if (!swaps_freed) {
- /* Swap was replaced by page: retry */
index = indices[i];
break;
}
@@ -1196,8 +1245,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
folio_lock(folio);
if (!unfalloc || !folio_test_uptodate(folio)) {
- if (folio_mapping(folio) != mapping) {
- /* Page was replaced by swap: retry */
+ /*
+ * Page was replaced by swap or was
+ * split: retry
+ */
+ if (folio_mapping(folio) != mapping ||
+ folio_order(folio) != orders[i]) {
folio_unlock(folio);
index = indices[i];
break;
--
2.30.0