* [PATCH 1/6] shmem: provide a shmem_write_folio wrapper
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
@ 2026-05-15 12:00 ` Christoph Hellwig
2026-05-15 12:00 ` [PATCH 2/6] mm: merge writeout into pageout Christoph Hellwig
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
Provide a wrapper for the shmem abuses in drm to prepare for swap I/O
refactoring by keeping swap_iocb handling entirely contained in mm/.
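As a minimal before/after sketch of the two drm call sites converted below
(error handling and surrounding context elided):

	/* before: drivers passed the mm-internal plug arguments themselves */
	error = shmem_writeout(folio, NULL, NULL);

	/* after: the swap_iocb plumbing stays an mm/ implementation detail */
	error = shmem_write_folio(folio);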
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 2 +-
drivers/gpu/drm/ttm/ttm_backup.c | 2 +-
include/linux/shmem_fs.h | 5 +----
mm/shmem.c | 7 ++++++-
mm/swap.h | 4 ++++
5 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 06543ae60706..ef9440166295 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -325,7 +325,7 @@ void __shmem_writeback(size_t size, struct address_space *mapping)
if (folio_mapped(folio))
folio_redirty_for_writepage(&wbc, folio);
else
- error = shmem_writeout(folio, NULL, NULL);
+ error = shmem_write_folio(folio);
}
}
diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c
index 81df4cb5606b..c5b813a563e7 100644
--- a/drivers/gpu/drm/ttm/ttm_backup.c
+++ b/drivers/gpu/drm/ttm/ttm_backup.c
@@ -117,7 +117,7 @@ ttm_backup_backup_page(struct file *backup, struct page *page,
if (writeback && !folio_mapped(to_folio) &&
folio_clear_dirty_for_io(to_folio)) {
folio_set_reclaim(to_folio);
- ret = shmem_writeout(to_folio, NULL, NULL);
+ ret = shmem_write_folio(to_folio);
if (!folio_test_writeback(to_folio))
folio_clear_reclaim(to_folio);
/*
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 93a0ba872ebe..ab404effa879 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -12,8 +12,6 @@
#include <linux/userfaultfd_k.h>
#include <linux/bits.h>
-struct swap_iocb;
-
/* inode in-kernel data */
#ifdef CONFIG_TMPFS_QUOTA
@@ -122,8 +120,7 @@ static inline bool shmem_mapping(const struct address_space *mapping)
void shmem_unlock_mapping(struct address_space *mapping);
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
-int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
- struct list_head *folio_list);
+int shmem_write_folio(struct folio *folio);
void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
diff --git a/mm/shmem.c b/mm/shmem.c
index 3b5dc21b323c..b8becbd4beaf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1738,7 +1738,12 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
folio_mark_dirty(folio);
return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
-EXPORT_SYMBOL_GPL(shmem_writeout);
+
+int shmem_write_folio(struct folio *folio)
+{
+ return shmem_writeout(folio, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(shmem_write_folio);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
diff --git a/mm/swap.h b/mm/swap.h
index a77016f2423b..b6db72fb9879 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -499,4 +499,8 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
return 0;
}
#endif /* CONFIG_SWAP */
+
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+ struct list_head *folio_list);
+
#endif /* _MM_SWAP_H */
--
2.53.0
* [PATCH 2/6] mm: merge writeout into pageout
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
2026-05-15 12:00 ` [PATCH 1/6] shmem: provide a shmem_write_folio wrapper Christoph Hellwig
@ 2026-05-15 12:00 ` Christoph Hellwig
2026-05-15 12:00 ` [PATCH 3/6] mm/swap: introduce struct swap_io_ctx Christoph Hellwig
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
writeout is only called from pageout, as straight-line code at the end of it,
so merge the two functions.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
mm/vmscan.c | 63 ++++++++++++++++++++++++-----------------------------
1 file changed, 29 insertions(+), 34 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd1b1aa12581..dc0d4312ac6c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -614,45 +614,14 @@ typedef enum {
PAGE_CLEAN,
} pageout_t;
-static pageout_t writeout(struct folio *folio, struct address_space *mapping,
- struct swap_iocb **plug, struct list_head *folio_list)
-{
- int res;
-
- folio_set_reclaim(folio);
-
- /*
- * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
- * or we failed to allocate contiguous swap entries, in which case
- * the split out folios get added back to folio_list.
- */
- if (shmem_mapping(mapping))
- res = shmem_writeout(folio, plug, folio_list);
- else
- res = swap_writeout(folio, plug);
-
- if (res < 0)
- handle_write_error(mapping, folio, res);
- if (res == AOP_WRITEPAGE_ACTIVATE) {
- folio_clear_reclaim(folio);
- return PAGE_ACTIVATE;
- }
-
- /* synchronous write? */
- if (!folio_test_writeback(folio))
- folio_clear_reclaim(folio);
-
- trace_mm_vmscan_write_folio(folio);
- node_stat_add_folio(folio, NR_VMSCAN_WRITE);
- return PAGE_SUCCESS;
-}
-
/*
* pageout is called by shrink_folio_list() for each dirty folio.
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int res;
+
/*
* We no longer attempt to writeback filesystem folios here, other
* than tmpfs/shmem. That's taken care of in page-writeback.
@@ -676,7 +645,33 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
return PAGE_ACTIVATE;
if (!folio_clear_dirty_for_io(folio))
return PAGE_CLEAN;
- return writeout(folio, mapping, plug, folio_list);
+
+ folio_set_reclaim(folio);
+
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
+ * or we failed to allocate contiguous swap entries, in which case
+ * the split out folios get added back to folio_list.
+ */
+ if (shmem_mapping(mapping))
+ res = shmem_writeout(folio, plug, folio_list);
+ else
+ res = swap_writeout(folio, plug);
+
+ if (res < 0)
+ handle_write_error(mapping, folio, res);
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
+ folio_clear_reclaim(folio);
+ return PAGE_ACTIVATE;
+ }
+
+ /* synchronous write? */
+ if (!folio_test_writeback(folio))
+ folio_clear_reclaim(folio);
+
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
+ return PAGE_SUCCESS;
}
/*
--
2.53.0
* [PATCH 3/6] mm/swap: introduce struct swap_io_ctx
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
2026-05-15 12:00 ` [PATCH 1/6] shmem: provide a shmem_write_folio wrapper Christoph Hellwig
2026-05-15 12:00 ` [PATCH 2/6] mm: merge writeout into pageout Christoph Hellwig
2026-05-15 12:00 ` [PATCH 3/6] mm/swap: introduce struct swap_io_ctx Christoph Hellwig
2026-05-15 12:00 ` [PATCH 4/6] mm/swap: also use struct swap_iocb for block I/O Christoph Hellwig
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
Generalize the context currently provided by double pointers to struct
swap_iocb to an on-stack context. This cleans up the code and prepares
for adding more fields and supporting batching multiple folios into a
single bio for block-based swap as well.
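As a minimal sketch of the caller-side conversion (all names are taken from
the hunks below), a double-pointer plug turns into an on-stack context that
is submitted once at the end:

	/* before: struct swap_iocb double pointer threaded through callers */
	struct swap_iocb *splug = NULL;

	swap_read_folio(folio, &splug);
	swap_read_unplug(splug);

	/* after: on-stack context, submitted explicitly */
	struct swap_io_ctx ctx = {};

	swap_read_folio(&ctx, folio);
	swap_read_submit(&ctx);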
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
mm/madvise.c | 16 +++++++--------
mm/page_io.c | 54 +++++++++++++++++++++++++++----------------------
mm/shmem.c | 13 ++++++++----
mm/swap.h | 36 ++++++++++++++-------------------
mm/swap_state.c | 40 +++++++++++++++++++-----------------
mm/vmscan.c | 15 +++++++-------
6 files changed, 91 insertions(+), 83 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 69708e953cf5..9ca82af8799a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -188,7 +188,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->private;
- struct swap_iocb *splug = NULL;
+ struct swap_io_ctx ctx = {};
pte_t *ptep = NULL;
spinlock_t *ptl;
unsigned long addr;
@@ -212,15 +212,15 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte_unmap_unlock(ptep, ptl);
ptep = NULL;
- folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, addr, &splug);
+ folio = read_swap_cache_async(&ctx, entry, GFP_HIGHUSER_MOVABLE,
+ vma, addr);
if (folio)
folio_put(folio);
}
if (ptep)
pte_unmap_unlock(ptep, ptl);
- swap_read_unplug(splug);
+ swap_read_submit(&ctx);
cond_resched();
return 0;
@@ -238,7 +238,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end) - 1;
struct folio *folio;
- struct swap_iocb *splug = NULL;
+ struct swap_io_ctx ctx = {};
rcu_read_lock();
xas_for_each(&xas, folio, end_index) {
@@ -257,15 +257,15 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
xas_pause(&xas);
rcu_read_unlock();
- folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
- vma, addr, &splug);
+ folio = read_swap_cache_async(&ctx, entry,
+ mapping_gfp_mask(mapping), vma, addr);
if (folio)
folio_put(folio);
rcu_read_lock();
}
rcu_read_unlock();
- swap_read_unplug(splug);
+ swap_read_submit(&ctx);
}
#endif /* CONFIG_SWAP */
diff --git a/mm/page_io.c b/mm/page_io.c
index 70cea9e24d2f..a78efc9909c8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,7 +237,7 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
+int swap_writeout(struct swap_io_ctx *ctx, struct folio *folio)
{
int ret = 0;
@@ -285,7 +285,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
}
rcu_read_unlock();
- __swap_writepage(folio, swap_plug);
+ __swap_writepage(ctx, folio);
return 0;
out_unlock:
folio_unlock(folio);
@@ -375,9 +375,9 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
-static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
+static void swap_writepage_fs(struct swap_io_ctx *ctx, struct folio *folio)
{
- struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
+ struct swap_iocb *sio = ctx->sio;
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct file *swap_file = sis->swap_file;
loff_t pos = swap_dev_pos(folio->swap);
@@ -388,7 +388,7 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
if (sio) {
if (sio->iocb.ki_filp != swap_file ||
sio->iocb.ki_pos + sio->len != pos) {
- swap_write_unplug(sio);
+ swap_write_submit(ctx);
sio = NULL;
}
}
@@ -403,12 +403,11 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !swap_plug) {
- swap_write_unplug(sio);
+ if (sio->pages == ARRAY_SIZE(sio->bvec)) {
+ swap_write_submit(ctx);
sio = NULL;
}
- if (swap_plug)
- *swap_plug = sio;
+ ctx->sio = sio;
}
static void swap_writepage_bdev_sync(struct folio *folio,
@@ -448,7 +447,7 @@ static void swap_writepage_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
+void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
@@ -459,7 +458,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
* is safe.
*/
if (data_race(sis->flags & SWP_FS_OPS))
- swap_writepage_fs(folio, swap_plug);
+ swap_writepage_fs(ctx, folio);
/*
* ->flags can be updated non-atomically,
* but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
@@ -471,16 +470,21 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
swap_writepage_bdev_async(folio, sis);
}
-void swap_write_unplug(struct swap_iocb *sio)
+void swap_write_submit(struct swap_io_ctx *ctx)
{
struct iov_iter from;
+ struct swap_iocb *sio = ctx->sio;
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
+ if (!sio)
+ return;
+
iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_write_complete(&sio->iocb, ret);
+ ctx->sio = NULL;
}
static void sio_read_complete(struct kiocb *iocb, long ret)
@@ -539,18 +543,16 @@ static bool swap_read_folio_zeromap(struct folio *folio)
return true;
}
-static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
+static void swap_read_folio_fs(struct swap_io_ctx *ctx, struct folio *folio)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
- struct swap_iocb *sio = NULL;
+ struct swap_iocb *sio = ctx->sio;
loff_t pos = swap_dev_pos(folio->swap);
- if (plug)
- sio = *plug;
if (sio) {
if (sio->iocb.ki_filp != sis->swap_file ||
sio->iocb.ki_pos + sio->len != pos) {
- swap_read_unplug(sio);
+ swap_read_submit(ctx);
sio = NULL;
}
}
@@ -565,12 +567,11 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
- swap_read_unplug(sio);
+ if (sio->pages == ARRAY_SIZE(sio->bvec)) {
+ swap_read_submit(ctx);
sio = NULL;
}
- if (plug)
- *plug = sio;
+ ctx->sio = sio;
}
static void swap_read_folio_bdev_sync(struct folio *folio,
@@ -610,7 +611,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
submit_bio(bio);
}
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO;
@@ -645,7 +646,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
zswap_folio_swapin(folio);
if (data_race(sis->flags & SWP_FS_OPS)) {
- swap_read_folio_fs(folio, plug);
+ swap_read_folio_fs(ctx, folio);
} else if (synchronous) {
swap_read_folio_bdev_sync(folio, sis);
} else {
@@ -660,14 +661,19 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
delayacct_swapin_end();
}
-void __swap_read_unplug(struct swap_iocb *sio)
+void swap_read_submit(struct swap_io_ctx *ctx)
{
struct iov_iter from;
+ struct swap_iocb *sio = ctx->sio;
struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
int ret;
+ if (!sio)
+ return;
+
iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
if (ret != -EIOCBQUEUED)
sio_read_complete(&sio->iocb, ret);
+ ctx->sio = NULL;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index b8becbd4beaf..a9c1694d2755 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1584,13 +1584,13 @@ int shmem_unuse(unsigned int type)
/**
* shmem_writeout - Write the folio to swap
+ * @ctx: swap I/O context
* @folio: The folio to write
- * @plug: swap plug
* @folio_list: list to put back folios on split
*
* Move the folio from the page cache to the swap cache.
*/
-int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
struct list_head *folio_list)
{
struct address_space *mapping = folio->mapping;
@@ -1702,7 +1702,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
BUG_ON(folio_mapped(folio));
- error = swap_writeout(folio, plug);
+ error = swap_writeout(ctx, folio);
if (error != AOP_WRITEPAGE_ACTIVATE) {
/* folio has been unlocked */
return error;
@@ -1741,7 +1741,12 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
int shmem_write_folio(struct folio *folio)
{
- return shmem_writeout(folio, NULL, NULL);
+ struct swap_io_ctx ctx = {};
+ int err;
+
+ err = shmem_writeout(&ctx, folio, NULL);
+ swap_write_submit(&ctx);
+ return err;
}
EXPORT_SYMBOL_GPL(shmem_write_folio);
diff --git a/mm/swap.h b/mm/swap.h
index b6db72fb9879..3ec35b6d629f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,7 +4,6 @@
#include <linux/atomic.h> /* for atomic_long_t */
struct mempolicy;
-struct swap_iocb;
extern int page_cluster;
@@ -54,6 +53,10 @@ enum swap_cluster_flags {
CLUSTER_FLAG_MAX,
};
+struct swap_io_ctx {
+ struct swap_iocb *sio;
+};
+
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
#include <linux/blk_types.h> /* for bio_end_io_t */
@@ -216,17 +219,11 @@ extern void __swap_cluster_free_entries(struct swap_info_struct *si,
/* linux/mm/page_io.c */
int sio_pool_init(void);
-struct swap_iocb;
-void swap_read_folio(struct folio *folio, struct swap_iocb **plug);
-void __swap_read_unplug(struct swap_iocb *plug);
-static inline void swap_read_unplug(struct swap_iocb *plug)
-{
- if (unlikely(plug))
- __swap_read_unplug(plug);
-}
-void swap_write_unplug(struct swap_iocb *sio);
-int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
-void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
+void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio);
+void swap_read_submit(struct swap_io_ctx *ctx);
+void swap_write_submit(struct swap_io_ctx *ctx);
+int swap_writeout(struct swap_io_ctx *ctx, struct folio *folio);
+void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio);
/* linux/mm/swap_state.c */
extern struct address_space swap_space __read_mostly;
@@ -293,9 +290,8 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci,
void show_swap_cache_info(void);
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
-struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr,
- struct swap_iocb **plug);
+struct folio *read_swap_cache_async(struct swap_io_ctx *ctx, swp_entry_t entry,
+ gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
@@ -353,7 +349,6 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
}
#else /* CONFIG_SWAP */
-struct swap_iocb;
static inline struct swap_cluster_info *swap_cluster_lock(
struct swap_info_struct *si, pgoff_t offset, bool irq)
{
@@ -399,11 +394,11 @@ static inline void folio_put_swap(struct folio *folio, struct page *page)
{
}
-static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
+static inline void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
{
}
-static inline void swap_write_unplug(struct swap_iocb *sio)
+static inline void swap_write_submit(struct swap_io_ctx *ctx)
{
}
@@ -443,8 +438,7 @@ static inline void swap_update_readahead(struct folio *folio,
{
}
-static inline int swap_writeout(struct folio *folio,
- struct swap_iocb **swap_plug)
+static inline int swap_writeout(struct swap_io_ctx *ctx, struct folio *folio)
{
return 0;
}
@@ -500,7 +494,7 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
}
#endif /* CONFIG_SWAP */
-int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
struct list_head *folio_list);
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 1415a5c54a43..abc26414368d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -573,14 +573,17 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
*/
struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
{
+ struct swap_io_ctx ctx = {};
struct folio *swapcache;
pgoff_t offset = swp_offset(entry);
unsigned long nr_pages = folio_nr_pages(folio);
entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true);
- if (swapcache == folio)
- swap_read_folio(folio, NULL);
+ if (swapcache == folio) {
+ swap_read_folio(&ctx, folio);
+ swap_read_submit(&ctx);
+ }
return swapcache;
}
@@ -590,9 +593,8 @@ struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
*/
-struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr,
- struct swap_iocb **plug)
+struct folio *read_swap_cache_async(struct swap_io_ctx *ctx, swp_entry_t entry,
+ gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr)
{
struct swap_info_struct *si;
bool page_allocated;
@@ -610,7 +612,7 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mpol_cond_put(mpol);
if (page_allocated)
- swap_read_folio(folio, plug);
+ swap_read_folio(ctx, folio);
put_swap_device(si);
return folio;
@@ -704,8 +706,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long start_offset, end_offset;
unsigned long mask;
struct swap_info_struct *si = __swap_entry_to_info(entry);
+ struct swap_io_ctx ctx = {};
struct blk_plug plug;
- struct swap_iocb *splug = NULL;
bool page_allocated;
mask = swapin_nr_pages(offset) - 1;
@@ -729,7 +731,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, &splug);
+ swap_read_folio(&ctx, folio);
if (offset != entry_offset) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -738,14 +740,15 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
folio_put(folio);
}
blk_finish_plug(&plug);
- swap_read_unplug(splug);
+ swap_read_submit(&ctx);
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- /* The page was likely read above, so no need for plugging here */
folio = swap_cache_alloc_folio(entry, gfp_mask, mpol, ilx,
&page_allocated);
- if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
+ if (unlikely(page_allocated)) {
+ swap_read_folio(&ctx, folio);
+ swap_read_submit(&ctx);
+ }
return folio;
}
@@ -806,8 +809,8 @@ static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
+ struct swap_io_ctx ctx = {};
struct blk_plug plug;
- struct swap_iocb *splug = NULL;
struct folio *folio;
pte_t *pte = NULL, pentry;
int win;
@@ -854,7 +857,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, &splug);
+ swap_read_folio(&ctx, folio);
if (addr != vmf->address) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -865,14 +868,15 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (pte)
pte_unmap(pte);
blk_finish_plug(&plug);
- swap_read_unplug(splug);
+ swap_read_submit(&ctx);
lru_add_drain();
skip:
- /* The folio was likely read above, so no need for plugging here */
folio = swap_cache_alloc_folio(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated);
- if (unlikely(page_allocated))
- swap_read_folio(folio, NULL);
+ if (unlikely(page_allocated)) {
+ swap_read_folio(&ctx, folio);
+ swap_read_submit(&ctx);
+ }
return folio;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dc0d4312ac6c..56cd59e27447 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -617,8 +617,8 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
*/
-static pageout_t pageout(struct folio *folio, struct address_space *mapping,
- struct swap_iocb **plug, struct list_head *folio_list)
+static pageout_t pageout(struct swap_io_ctx *ctx, struct address_space *mapping,
+ struct folio *folio, struct list_head *folio_list)
{
int res;
@@ -654,9 +654,9 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
* the split out folios get added back to folio_list.
*/
if (shmem_mapping(mapping))
- res = shmem_writeout(folio, plug, folio_list);
+ res = shmem_writeout(ctx, folio, folio_list);
else
- res = swap_writeout(folio, plug);
+ res = swap_writeout(ctx, folio);
if (res < 0)
handle_write_error(mapping, folio, res);
@@ -1061,7 +1061,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
unsigned int nr_reclaimed = 0, nr_demoted = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
- struct swap_iocb *plug = NULL;
+ struct swap_io_ctx ctx = {};
folio_batch_init(&free_folios);
memset(stat, 0, sizeof(*stat));
@@ -1392,7 +1392,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug, folio_list)) {
+ switch (pageout(&ctx, mapping, folio, folio_list)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -1582,8 +1582,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
list_splice(&ret_folios, folio_list);
count_vm_events(PGACTIVATE, pgactivate);
- if (plug)
- swap_write_unplug(plug);
+ swap_write_submit(&ctx);
return nr_reclaimed;
}
--
2.53.0
* [PATCH 4/6] mm/swap: also use struct swap_iocb for block I/O
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
` (2 preceding siblings ...)
2026-05-15 12:00 ` [PATCH 3/6] mm/swap: introduce struct swap_io_ctx Christoph Hellwig
@ 2026-05-15 12:00 ` Christoph Hellwig
2026-05-15 12:00 ` [PATCH 5/6] mm/swap: use swap_ops to register swap device's methods Christoph Hellwig
2026-05-15 12:00 ` [PATCH 6/6] mm/swap: remove SWP_FS_OPS Christoph Hellwig
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
Block I/O benefits from batching just as much as remote file systems.
Extend struct swap_iocb to support building a bio on the fly as well,
and rewrite the block-based swap code for it. This especially benefits
submit_bio based drivers that do not have block plugging available,
but it also saves allocating extra bios for blk-mq drivers.
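The central data structure change is that struct swap_iocb now carries
either a kiocb or a bio built on the fly over the same bvec array (sketch
copied from the hunk below); folios are appended to the bvec array while
they are contiguous on the swap device and only turned into an actual kiocb
or bio when the context is submitted:

	struct swap_iocb {
		union {
			struct kiocb	iocb;	/* SWP_FS_OPS swapfiles */
			struct bio	bio;	/* block device swap */
		};
		struct bio_vec	bvec[SWAP_CLUSTER_MAX];
		int		nr_vecs;
		int		len;
	};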
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
mm/page_io.c | 506 +++++++++++++++++++++++---------------------------
mm/swap.h | 1 +
mm/swapfile.c | 9 +-
3 files changed, 235 insertions(+), 281 deletions(-)
diff --git a/mm/page_io.c b/mm/page_io.c
index a78efc9909c8..bbd8cf47d20d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -27,54 +27,6 @@
#include <linux/zswap.h>
#include "swap.h"
-static void __end_swap_bio_write(struct bio *bio)
-{
- struct folio *folio = bio_first_folio_all(bio);
-
- if (bio->bi_status) {
- /*
- * We failed to write the page out to swap-space.
- * Re-dirty the page in order to avoid it being reclaimed.
- * Also print a dire warning that things will go BAD (tm)
- * very quickly.
- *
- * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
- */
- folio_mark_dirty(folio);
- pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
- folio_clear_reclaim(folio);
- }
- folio_end_writeback(folio);
-}
-
-static void end_swap_bio_write(struct bio *bio)
-{
- __end_swap_bio_write(bio);
- bio_put(bio);
-}
-
-static void __end_swap_bio_read(struct bio *bio)
-{
- struct folio *folio = bio_first_folio_all(bio);
-
- if (bio->bi_status) {
- pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
- } else {
- folio_mark_uptodate(folio);
- }
- folio_unlock(folio);
-}
-
-static void end_swap_bio_read(struct bio *bio)
-{
- __end_swap_bio_read(bio);
- bio_put(bio);
-}
-
int generic_swapfile_activate(struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
@@ -325,9 +277,12 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
struct swap_iocb {
- struct kiocb iocb;
+ union {
+ struct kiocb iocb;
+ struct bio bio;
+ };
struct bio_vec bvec[SWAP_CLUSTER_MAX];
- int pages;
+ int nr_vecs;
int len;
};
static mempool_t *sio_pool;
@@ -345,172 +300,68 @@ int sio_pool_init(void)
return 0;
}
-static void sio_write_complete(struct kiocb *iocb, long ret)
+static bool swap_can_merge(struct swap_io_ctx *ctx, struct folio *folio)
{
- struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- struct page *page = sio->bvec[0].bv_page;
- int p;
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
+ struct bio_vec *last_bv = &ctx->sio->bvec[ctx->sio->nr_vecs - 1];
+ struct folio *prev_folio = page_folio(last_bv->bv_page);
+ size_t prev_folio_size = folio_size(prev_folio);
- if (ret != sio->len) {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * folio_rotate_reclaimable but rate-limit the
- * messages.
- */
- pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
- ret, swap_dev_pos(page_swap_entry(page)));
- for (p = 0; p < sio->pages; p++) {
- page = sio->bvec[p].bv_page;
- set_page_dirty(page);
- ClearPageReclaim(page);
- }
- }
+ if (ctx->sis != sis)
+ return false;
- for (p = 0; p < sio->pages; p++)
- end_page_writeback(sio->bvec[p].bv_page);
+ if (sis->flags & SWP_FS_OPS) {
+ if (swap_dev_pos(folio->swap) !=
+ swap_dev_pos(prev_folio->swap) + prev_folio_size)
+ return false;
+ } else {
+ if (swap_folio_sector(folio) !=
+ swap_folio_sector(prev_folio) +
+ (prev_folio_size >> SECTOR_SHIFT))
+ return false;
+ }
- mempool_free(sio, sio_pool);
+ return true;
}
-static void swap_writepage_fs(struct swap_io_ctx *ctx, struct folio *folio)
+static void swap_add_page(struct swap_io_ctx *ctx, struct folio *folio, int rw)
{
- struct swap_iocb *sio = ctx->sio;
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
- struct file *swap_file = sis->swap_file;
- loff_t pos = swap_dev_pos(folio->swap);
+ struct swap_iocb *sio = ctx->sio;
- count_swpout_vm_event(folio);
- folio_start_writeback(folio);
- folio_unlock(folio);
- if (sio) {
- if (sio->iocb.ki_filp != swap_file ||
- sio->iocb.ki_pos + sio->len != pos) {
+ if (sio && !swap_can_merge(ctx, folio)) {
+ if (rw == WRITE)
swap_write_submit(ctx);
- sio = NULL;
- }
+ else
+ swap_read_submit(ctx);
+ sio = ctx->sio;
}
+
if (!sio) {
- sio = mempool_alloc(sio_pool, GFP_NOIO);
- init_sync_kiocb(&sio->iocb, swap_file);
- sio->iocb.ki_complete = sio_write_complete;
- sio->iocb.ki_pos = pos;
- sio->pages = 0;
+ ctx->sis = sis;
+ ctx->sio = sio = mempool_alloc(sio_pool, GFP_NOIO);
+ sio->nr_vecs = 0;
sio->len = 0;
}
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
+ bvec_set_folio(&sio->bvec[sio->nr_vecs], folio, folio_size(folio), 0);
sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec)) {
- swap_write_submit(ctx);
- sio = NULL;
+ sio->nr_vecs += 1;
+ if (sio->nr_vecs == ARRAY_SIZE(sio->bvec)) {
+ if (rw == WRITE)
+ swap_write_submit(ctx);
+ else
+ swap_read_submit(ctx);
}
- ctx->sio = sio;
}
-static void swap_writepage_bdev_sync(struct folio *folio,
- struct swap_info_struct *sis)
-{
- struct bio_vec bv;
- struct bio bio;
-
- bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
- bio.bi_iter.bi_sector = swap_folio_sector(folio);
- bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
-
- bio_associate_blkg_from_page(&bio, folio);
- count_swpout_vm_event(folio);
-
- folio_start_writeback(folio);
- folio_unlock(folio);
-
- submit_bio_wait(&bio);
- __end_swap_bio_write(&bio);
-}
-
-static void swap_writepage_bdev_async(struct folio *folio,
- struct swap_info_struct *sis)
+void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio)
{
- struct bio *bio;
-
- bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
- bio->bi_iter.bi_sector = swap_folio_sector(folio);
- bio->bi_end_io = end_swap_bio_write;
- bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- bio_associate_blkg_from_page(bio, folio);
count_swpout_vm_event(folio);
folio_start_writeback(folio);
folio_unlock(folio);
- submit_bio(bio);
-}
-
-void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio)
-{
- struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
-
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- /*
- * ->flags can be updated non-atomically,
- * but that will never affect SWP_FS_OPS, so the data_race
- * is safe.
- */
- if (data_race(sis->flags & SWP_FS_OPS))
- swap_writepage_fs(ctx, folio);
- /*
- * ->flags can be updated non-atomically,
- * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
- * is safe.
- */
- else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
- swap_writepage_bdev_sync(folio, sis);
- else
- swap_writepage_bdev_async(folio, sis);
-}
-
-void swap_write_submit(struct swap_io_ctx *ctx)
-{
- struct iov_iter from;
- struct swap_iocb *sio = ctx->sio;
- struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
- int ret;
-
- if (!sio)
- return;
-
- iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
- ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
- if (ret != -EIOCBQUEUED)
- sio_write_complete(&sio->iocb, ret);
- ctx->sio = NULL;
-}
-
-static void sio_read_complete(struct kiocb *iocb, long ret)
-{
- struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
- int p;
-
- if (ret == sio->len) {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
-
- count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
- count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- }
- count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
- } else {
- for (p = 0; p < sio->pages; p++) {
- struct folio *folio = page_folio(sio->bvec[p].bv_page);
-
- folio_unlock(folio);
- }
- pr_alert_ratelimited("Read-error on swap-device\n");
- }
- mempool_free(sio, sio_pool);
+ swap_add_page(ctx, folio, WRITE);
}
static bool swap_read_folio_zeromap(struct folio *folio)
@@ -543,74 +394,6 @@ static bool swap_read_folio_zeromap(struct folio *folio)
return true;
}
-static void swap_read_folio_fs(struct swap_io_ctx *ctx, struct folio *folio)
-{
- struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
- struct swap_iocb *sio = ctx->sio;
- loff_t pos = swap_dev_pos(folio->swap);
-
- if (sio) {
- if (sio->iocb.ki_filp != sis->swap_file ||
- sio->iocb.ki_pos + sio->len != pos) {
- swap_read_submit(ctx);
- sio = NULL;
- }
- }
- if (!sio) {
- sio = mempool_alloc(sio_pool, GFP_KERNEL);
- init_sync_kiocb(&sio->iocb, sis->swap_file);
- sio->iocb.ki_pos = pos;
- sio->iocb.ki_complete = sio_read_complete;
- sio->pages = 0;
- sio->len = 0;
- }
- bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
- sio->len += folio_size(folio);
- sio->pages += 1;
- if (sio->pages == ARRAY_SIZE(sio->bvec)) {
- swap_read_submit(ctx);
- sio = NULL;
- }
- ctx->sio = sio;
-}
-
-static void swap_read_folio_bdev_sync(struct folio *folio,
- struct swap_info_struct *sis)
-{
- struct bio_vec bv;
- struct bio bio;
-
- bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = swap_folio_sector(folio);
- bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
- /*
- * Keep this task valid during swap readpage because the oom killer may
- * attempt to access it in the page fault retry time check.
- */
- get_task_struct(current);
- count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
- count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
- count_vm_events(PSWPIN, folio_nr_pages(folio));
- submit_bio_wait(&bio);
- __end_swap_bio_read(&bio);
- put_task_struct(current);
-}
-
-static void swap_read_folio_bdev_async(struct folio *folio,
- struct swap_info_struct *sis)
-{
- struct bio *bio;
-
- bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
- bio->bi_iter.bi_sector = swap_folio_sector(folio);
- bio->bi_end_io = end_swap_bio_read;
- bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
- count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
- count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
- count_vm_events(PSWPIN, folio_nr_pages(folio));
- submit_bio(bio);
-}
-
void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
{
struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
@@ -644,14 +427,7 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
-
- if (data_race(sis->flags & SWP_FS_OPS)) {
- swap_read_folio_fs(ctx, folio);
- } else if (synchronous) {
- swap_read_folio_bdev_sync(folio, sis);
- } else {
- swap_read_folio_bdev_async(folio, sis);
- }
+ swap_add_page(ctx, folio, READ);
finish:
if (workingset) {
@@ -661,19 +437,197 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
delayacct_swapin_end();
}
-void swap_read_submit(struct swap_io_ctx *ctx)
+static void sio_write_end(struct swap_iocb *sio, bool failed)
+{
+ int p;
+
+ for (p = 0; p < sio->nr_vecs; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ if (failed) {
+ set_page_dirty(page);
+ ClearPageReclaim(page);
+ }
+ end_page_writeback(page);
+ }
+ mempool_free(sio, sio_pool);
+}
+
+static void sio_write_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ bool failed = ret != sio->len;
+
+ if (failed) {
+ struct page *page = sio->bvec[0].bv_page;
+
+ /*
+ * In the case of swap-over-nfs, this can be a temporary failure
+ * if the system has limited memory for allocating transmit
+ * buffers. Mark the page dirty and avoid
+ * folio_rotate_reclaimable but rate-limit the messages.
+ */
+ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+ ret, swap_dev_pos(page_swap_entry(page)));
+ }
+
+ sio_write_end(sio, failed);
+}
+
+static void end_swap_bio_write(struct bio *bio)
+{
+ struct swap_iocb *sio = container_of(bio, struct swap_iocb, bio);
+ bool failed = !!bio->bi_status;
+
+ if (failed)
+ pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ sio_write_end(sio, failed);
+}
+
+static void sio_read_end(struct swap_iocb *sio)
+{
+ int p;
+
+ for (p = 0; p < sio->nr_vecs; p++) {
+ struct folio *folio = page_folio(sio->bvec[p].bv_page);
+
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
+ count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
+ mempool_free(sio, sio_pool);
+}
+
+static void sio_read_fail(struct swap_iocb *sio)
+{
+ int p;
+
+ for (p = 0; p < sio->nr_vecs; p++)
+ folio_unlock(page_folio(sio->bvec[p].bv_page));
+ mempool_free(sio, sio_pool);
+}
+
+static void sio_read_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+
+ if (ret != sio->len) {
+ pr_alert_ratelimited("Read-error on swap-device\n");
+ sio_read_fail(sio);
+ return;
+ }
+
+ sio_read_end(sio);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ struct swap_iocb *sio = container_of(bio, struct swap_iocb, bio);
+
+ if (bio->bi_status) {
+ pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ sio_read_fail(sio);
+ return;
+ }
+
+ sio_read_end(sio);
+}
+
+static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
+{
+ struct swap_iocb *sio = ctx->sio;
+ struct bio *bio = &sio->bio;
+
+ bio_init(bio, ctx->sis->bdev, sio->bvec, ARRAY_SIZE(sio->bvec),
+ REQ_OP_WRITE | REQ_SWAP);
+ bio->bi_iter.bi_size = sio->len;
+ bio->bi_iter.bi_sector = swap_folio_sector(bio_first_folio_all(bio));
+ bio_associate_blkg_from_page(bio, bio_first_folio_all(bio));
+
+ if (ctx->sis->flags & SWP_SYNCHRONOUS_IO) {
+ submit_bio_wait(bio);
+ end_swap_bio_write(bio);
+ } else {
+ bio->bi_end_io = end_swap_bio_write;
+ submit_bio(bio);
+ }
+}
+
+static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
+{
+ struct swap_iocb *sio = ctx->sio;
+ struct bio *bio = &sio->bio;
+
+ bio_init(bio, ctx->sis->bdev, sio->bvec, ARRAY_SIZE(sio->bvec),
+ REQ_OP_READ);
+ bio->bi_iter.bi_size = sio->len;
+ bio->bi_iter.bi_sector = swap_folio_sector(bio_first_folio_all(bio));
+
+ if (ctx->sis->flags & SWP_SYNCHRONOUS_IO) {
+ /*
+ * Keep this task valid during swap readpage because the oom
+ * killer may attempt to access it in the page fault retry
+ * time check.
+ */
+ get_task_struct(current);
+ submit_bio_wait(bio);
+ end_swap_bio_read(bio);
+ put_task_struct(current);
+ } else {
+ bio->bi_end_io = end_swap_bio_read;
+ submit_bio(bio);
+ }
+}
+
+static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
{
- struct iov_iter from;
struct swap_iocb *sio = ctx->sio;
- struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ struct address_space *mapping = ctx->sis->swap_file->f_mapping;
+ struct iov_iter iter;
int ret;
- if (!sio)
- return;
+ init_sync_kiocb(&sio->iocb, ctx->sis->swap_file);
+ sio->iocb.ki_pos = swap_dev_pos(page_folio(sio->bvec[0].bv_page)->swap);
+ if (rw == WRITE)
+ sio->iocb.ki_complete = sio_write_complete;
+ else
+ sio->iocb.ki_complete = sio_read_complete;
- iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
- ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ iov_iter_bvec(&iter, rw == WRITE ? ITER_SOURCE : ITER_DEST,
+ sio->bvec, sio->nr_vecs, sio->len);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &iter);
if (ret != -EIOCBQUEUED)
- sio_read_complete(&sio->iocb, ret);
+ sio->iocb.ki_complete(&sio->iocb, ret);
+}
+
+void swap_write_submit(struct swap_io_ctx *ctx)
+{
+ if (!ctx->sio)
+ return;
+
+ if (ctx->sis->flags & SWP_FS_OPS)
+ swap_fs_submit(ctx, WRITE);
+ else
+ swap_bdev_submit_write(ctx);
+ ctx->sio = NULL;
+ ctx->sis = NULL;
+}
+
+void swap_read_submit(struct swap_io_ctx *ctx)
+{
+ if (!ctx->sio)
+ return;
+
+ if (ctx->sis->flags & SWP_FS_OPS)
+ swap_fs_submit(ctx, READ);
+ else
+ swap_bdev_submit_read(ctx);
ctx->sio = NULL;
+ ctx->sis = NULL;
}
diff --git a/mm/swap.h b/mm/swap.h
index 3ec35b6d629f..b359735be3c5 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -55,6 +55,7 @@ enum swap_cluster_flags {
struct swap_io_ctx {
struct swap_iocb *sio;
+ struct swap_info_struct *sis;
};
#ifdef CONFIG_SWAP
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9174f1eeffb0..27dbce0d1e1e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2781,6 +2781,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
struct inode *inode = mapping->host;
int ret;
+ ret = sio_pool_init();
+ if (ret)
+ return ret;
+
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
@@ -2792,11 +2796,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
if (ret < 0)
return ret;
sis->flags |= SWP_ACTIVATED;
- if ((sis->flags & SWP_FS_OPS) &&
- sio_pool_init() != 0) {
- destroy_swap_extents(sis, swap_file);
- return -ENOMEM;
- }
return ret;
}
--
2.53.0
* [PATCH 5/6] mm/swap: use swap_ops to register swap device's methods
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
` (3 preceding siblings ...)
2026-05-15 12:00 ` [PATCH 4/6] mm/swap: also use struct swap_iocb for block I/O Christoph Hellwig
@ 2026-05-15 12:00 ` Christoph Hellwig
2026-05-15 12:00 ` [PATCH 6/6] mm/swap: remove SWP_FS_OPS Christoph Hellwig
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
From: Baoquan He <baoquan.he@linux.dev>
This simplifies the code and makes the logic clearer. It also makes it
easier to add any new swap device type later.
Currently there are two types of swap devices: fs and bdev.
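A sketch of the resulting abstraction, taken from the hunks below: each
swap device type provides its methods in a const ops table, and
setup_swap_extents() / ->swap_activate() select the right one:

	struct swap_ops {
		bool (*can_merge)(struct folio *folio, struct folio *prev_folio,
				  size_t prev_folio_size);
		void (*submit_write)(struct swap_io_ctx *ctx);
		void (*submit_read)(struct swap_io_ctx *ctx);
	};

	/* block device backed swap (the default) */
	sis->ops = &swap_bdev_ops;

	/* swapfile going through ->swap_rw (SWP_FS_OPS) */
	sis->ops = &swap_fs_ops;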
Suggested-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Baoquan He <baoquan.he@linux.dev>
[hch: updated for the new submit and can_merge abstraction]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/swap.h | 1 +
mm/page_io.c | 63 ++++++++++++++++++++++++++++----------------
mm/swap.h | 10 +++++++
mm/swapfile.c | 4 +++
4 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7a09df6977a5..0da33b803348 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -282,6 +282,7 @@ struct swap_info_struct {
struct work_struct reclaim_work; /* reclaim worker */
struct list_head discard_clusters; /* discard clusters list */
struct plist_node avail_list; /* entry in swap_avail_head */
+ const struct swap_ops *ops;
};
static inline swp_entry_t page_swap_entry(struct page *page)
diff --git a/mm/page_io.c b/mm/page_io.c
index bbd8cf47d20d..4678a8af9f96 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -309,19 +309,7 @@ static bool swap_can_merge(struct swap_io_ctx *ctx, struct folio *folio)
if (ctx->sis != sis)
return false;
-
- if (sis->flags & SWP_FS_OPS) {
- if (swap_dev_pos(folio->swap) !=
- swap_dev_pos(prev_folio->swap) + prev_folio_size)
- return false;
- } else {
- if (swap_folio_sector(folio) !=
- swap_folio_sector(prev_folio) +
- (prev_folio_size >> SECTOR_SHIFT))
- return false;
- }
-
- return true;
+ return sis->ops->can_merge(folio, prev_folio, prev_folio_size);
}
static void swap_add_page(struct swap_io_ctx *ctx, struct folio *folio, int rw)
@@ -585,6 +573,20 @@ static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
}
}
+static bool swap_bdev_can_merge(struct folio *folio, struct folio *prev_folio,
+ size_t prev_folio_size)
+{
+ return swap_folio_sector(folio) ==
+ swap_folio_sector(prev_folio) +
+ (prev_folio_size >> SECTOR_SHIFT);
+}
+
+const struct swap_ops swap_bdev_ops = {
+ .submit_write = swap_bdev_submit_write,
+ .submit_read = swap_bdev_submit_read,
+ .can_merge = swap_bdev_can_merge,
+};
+
static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
{
struct swap_iocb *sio = ctx->sio;
@@ -606,15 +608,34 @@ static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
sio->iocb.ki_complete(&sio->iocb, ret);
}
+static void swap_fs_submit_write(struct swap_io_ctx *ctx)
+{
+ swap_fs_submit(ctx, WRITE);
+}
+
+static void swap_fs_submit_read(struct swap_io_ctx *ctx)
+{
+ swap_fs_submit(ctx, READ);
+}
+
+static bool swap_fs_can_merge(struct folio *folio, struct folio *prev_folio,
+ size_t prev_folio_size)
+{
+ return swap_dev_pos(folio->swap) ==
+ swap_dev_pos(prev_folio->swap) + prev_folio_size;
+}
+
+const struct swap_ops swap_fs_ops = {
+ .submit_write = swap_fs_submit_write,
+ .submit_read = swap_fs_submit_read,
+ .can_merge = swap_fs_can_merge,
+};
+
void swap_write_submit(struct swap_io_ctx *ctx)
{
if (!ctx->sio)
return;
-
- if (ctx->sis->flags & SWP_FS_OPS)
- swap_fs_submit(ctx, WRITE);
- else
- swap_bdev_submit_write(ctx);
+ ctx->sis->ops->submit_write(ctx);
ctx->sio = NULL;
ctx->sis = NULL;
}
@@ -623,11 +644,7 @@ void swap_read_submit(struct swap_io_ctx *ctx)
{
if (!ctx->sio)
return;
-
- if (ctx->sis->flags & SWP_FS_OPS)
- swap_fs_submit(ctx, READ);
- else
- swap_bdev_submit_read(ctx);
+ ctx->sis->ops->submit_read(ctx);
ctx->sio = NULL;
ctx->sis = NULL;
}
diff --git a/mm/swap.h b/mm/swap.h
index b359735be3c5..aaf774fd03b4 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -58,6 +58,13 @@ struct swap_io_ctx {
struct swap_info_struct *sis;
};
+struct swap_ops {
+ bool (*can_merge)(struct folio *folio, struct folio *prev_folio,
+ size_t prev_folio_size);
+ void (*submit_write)(struct swap_io_ctx *ctx);
+ void (*submit_read)(struct swap_io_ctx *ctx);
+};
+
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
#include <linux/blk_types.h> /* for bio_end_io_t */
@@ -495,6 +502,9 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
}
#endif /* CONFIG_SWAP */
+extern const struct swap_ops swap_bdev_ops;
+extern const struct swap_ops swap_fs_ops;
+
int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
struct list_head *folio_list);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 27dbce0d1e1e..fce69a91e7b4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2785,6 +2785,8 @@ static int setup_swap_extents(struct swap_info_struct *sis,
if (ret)
return ret;
+ sis->ops = &swap_bdev_ops;
+
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
@@ -2795,6 +2797,8 @@ static int setup_swap_extents(struct swap_info_struct *sis,
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
if (ret < 0)
return ret;
+ if (sis->flags & SWP_FS_OPS)
+ sis->ops = &swap_fs_ops;
sis->flags |= SWP_ACTIVATED;
return ret;
}
--
2.53.0
* [PATCH 6/6] mm/swap: remove SWP_FS_OPS
2026-05-15 12:00 RFC: better block swap batching and a different take on swap_ops Christoph Hellwig
` (4 preceding siblings ...)
2026-05-15 12:00 ` [PATCH 5/6] mm/swap: use swap_ops to register swap device's methods Christoph Hellwig
@ 2026-05-15 12:00 ` Christoph Hellwig
5 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2026-05-15 12:00 UTC (permalink / raw)
Cc: baoquan.he, akpm, chrisl, usama.arif, kasong, nphamcs, shikemeng,
youngjun.park, linux-mm
Provide a swap_fs_activate helper that directly sets up swap_fs_ops,
and a flag in struct swap_ops to indicate whether NOFS swapping is allowed.
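For filesystems the conversion is mechanical; a hypothetical
->swap_activate() implementation after this patch, modelled on the NFS and
cifs hunks below ("foo" is a placeholder name, filesystem specific setup
elided):

	static int foo_swap_activate(struct swap_info_struct *sis,
			struct file *file, sector_t *span)
	{
		/* filesystem specific checks and setup go here */

		/* replaces "sis->flags |= SWP_FS_OPS" + add_swap_extent() */
		return swap_fs_activate(sis);
	}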
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Documentation/filesystems/locking.rst | 5 +++--
Documentation/filesystems/vfs.rst | 4 ++--
fs/nfs/file.c | 4 +---
fs/smb/client/file.c | 4 +---
include/linux/swap.h | 6 +++++-
mm/page_io.c | 9 ++++++++-
mm/swap.h | 5 ++++-
mm/swapfile.c | 2 --
mm/vmscan.c | 14 ++++++--------
9 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 8421ea21bd35..70481bdc031d 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -355,13 +355,14 @@ should perform any validation and preparation necessary to ensure that
writes can be performed with minimal memory allocation. It should call
add_swap_extent(), or the helper iomap_swapfile_activate(), and return
the number of extents added. If IO should be submitted through
-->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted
+->swap_rw(), it should call swap_fs_activate, otherwise IO will be submitted
directly to the block device ``sis->bdev``.
->swap_deactivate() will be called in the sys_swapoff()
path after ->swap_activate() returned success.
-->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate().
+->swap_rw will be called for swap IO if swap_fs_activate was called by
+->swap_activate().
file_lock_operations
====================
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7c753148af88..e7677423a20f 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -977,7 +977,7 @@ cache in your filesystem. The following members are defined:
can be performed with minimal memory allocation. It should call
add_swap_extent(), or the helper iomap_swapfile_activate(), and
return the number of extents added. If IO should be submitted
- through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will
+ through ->swap_rw(), it should call swap_fs_activate, otherwise IO will
be submitted directly to the block device ``sis->bdev``.
``swap_deactivate``
@@ -985,7 +985,7 @@ cache in your filesystem. The following members are defined:
successful.
``swap_rw``
- Called to read or write swap pages when SWP_FS_OPS is set.
+ Called to read or write swap pages when swap_fs_activate was called.
The File Object
===============
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 25048a3c2364..8172c9972b46 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -589,7 +589,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = rpc_clnt_swap_activate(clnt);
if (ret)
return ret;
- ret = add_swap_extent(sis, 0, sis->max, 0);
+ ret = swap_fs_activate(sis);
if (ret < 0) {
rpc_clnt_swap_deactivate(clnt);
return ret;
@@ -599,8 +599,6 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (cl->rpc_ops->enable_swap)
cl->rpc_ops->enable_swap(inode);
-
- sis->flags |= SWP_FS_OPS;
return ret;
}
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 664a2c223089..74c2748484ff 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3327,9 +3327,7 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
* but we could add call to grab a byte range lock to prevent others
* from reading or writing the file
*/
-
- sis->flags |= SWP_FS_OPS;
- return add_swap_extent(sis, 0, sis->max, 0);
+ return swap_fs_activate(sis);
}
static void cifs_swap_deactivate(struct file *file)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0da33b803348..15790544ca3e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -208,7 +208,6 @@ enum {
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
SWP_BLKDEV = (1 << 6), /* its a block device */
SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */
- SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */
SWP_AREA_DISCARD = (1 << 9), /* single-time swap area discards */
SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */
@@ -404,6 +403,7 @@ extern void __meminit kswapd_stop(int nid);
#ifdef CONFIG_SWAP
+int swap_fs_activate(struct swap_info_struct *sis);
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
@@ -528,6 +528,10 @@ static inline bool folio_free_swap(struct folio *folio)
return false;
}
+static inline int swap_fs_activate(struct swap_info_struct *sis)
+{
+ return -EINVAL;
+}
static inline int add_swap_extent(struct swap_info_struct *sis,
unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
diff --git a/mm/page_io.c b/mm/page_io.c
index 4678a8af9f96..46eed28ee261 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -625,12 +625,19 @@ static bool swap_fs_can_merge(struct folio *folio, struct folio *prev_folio,
swap_dev_pos(prev_folio->swap) + prev_folio_size;
}
-const struct swap_ops swap_fs_ops = {
+static const struct swap_ops swap_fs_ops = {
+ .flags = SWAP_OPS_F_NOFS,
.submit_write = swap_fs_submit_write,
.submit_read = swap_fs_submit_read,
.can_merge = swap_fs_can_merge,
};
+int swap_fs_activate(struct swap_info_struct *sis)
+{
+ sis->ops = &swap_fs_ops;
+ return add_swap_extent(sis, 0, sis->max, 0);
+}
+
void swap_write_submit(struct swap_io_ctx *ctx)
{
if (!ctx->sio)
diff --git a/mm/swap.h b/mm/swap.h
index aaf774fd03b4..b70dd4178baa 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -58,7 +58,11 @@ struct swap_io_ctx {
struct swap_info_struct *sis;
};
+#define SWAP_OPS_F_NOFS (1U << 0)
+
struct swap_ops {
+ unsigned int flags;
+
bool (*can_merge)(struct folio *folio, struct folio *prev_folio,
size_t prev_folio_size);
void (*submit_write)(struct swap_io_ctx *ctx);
@@ -503,7 +507,6 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
#endif /* CONFIG_SWAP */
extern const struct swap_ops swap_bdev_ops;
-extern const struct swap_ops swap_fs_ops;
int shmem_writeout(struct swap_io_ctx *ctx, struct folio *folio,
struct list_head *folio_list);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fce69a91e7b4..7b44caf6a0e8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2797,8 +2797,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
if (ret < 0)
return ret;
- if (sis->flags & SWP_FS_OPS)
- sis->ops = &swap_fs_ops;
sis->flags |= SWP_ACTIVATED;
return ret;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56cd59e27447..d0bc145098e0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1035,16 +1035,14 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
{
if (gfp_mask & __GFP_FS)
return true;
- if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
- return false;
/*
- * We can "enter_fs" for swap-cache with only __GFP_IO
- * providing this isn't SWP_FS_OPS.
- * ->flags can be updated non-atomically,
- * but that will never affect SWP_FS_OPS, so the data_race
- * is safe.
+ * We can "enter_fs" for swap-cache with only __GFP_IO unless backed by
+ * a swapfile that requires GFP_NOFS I/O.
*/
- return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
+ if (folio_test_swapcache(folio) && (gfp_mask & __GFP_IO) &&
+ !(__swap_entry_to_info(folio->swap)->ops->flags & SWAP_OPS_F_NOFS))
+ return true;
+ return false;
}
/*
--
2.53.0