Re: [PATCH 4/8] mm/swap: also use struct swap_iocb for block I/O

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Baoquan He <baoquan.he@linux.dev>
To: Christoph Hellwig <hch@lst.de>
Cc: akpm@linux-foundation.org, chrisl@kernel.org,
	usama.arif@linux.dev, kasong@tencent.com, nphamcs@gmail.com,
	shikemeng@huaweicloud.com, youngjun.park@lge.com,
	linux-mm@kvack.org
Subject: Re: [PATCH 4/8] mm/swap: also use struct swap_iocb for block I/O
Date: Thu, 4 Jun 2026 19:37:09 +0800	[thread overview]
Message-ID: <aiFjZXQ73awVxUFd@MiWiFi-R3L-srv> (raw)
In-Reply-To: <20260601113449.3464734-5-hch@lst.de>

On 06/01/26 at 01:34pm, Christoph Hellwig wrote:
> Block I/O benefits from batching just as much as remote file systems.
> Extent struct swap_iocb to support building a bio on the fly as well,
> and rewrite the block based swap code for it.  This especially benefits
> submit_bio based drivers that do not have the block plugging available,
> but also saves allocating extra bios for blk-mq drivers.
> 
> Note that the block based swap code now uses the same memcg-based
> check previously added for file system based swap as well.


I would add the words below to the commit log to ease patch
reviewing/studying, though this may just be personal style.

What it does:
- Adds bio to a union with kiocb in struct swap_iocb, so the same sio
  can drive either FS or block I/O;
- Removes 6 functions (swap_writepage_bdev_sync, swap_writepage_bdev_async,
  swap_read_folio_bdev_sync, swap_read_folio_bdev_async, swap_writepage_fs,
  swap_read_folio_fs) and replaces them with a unified flow
  (swap_add_page + swap_can_merge + submit path);
- Adds a sis pointer to swap_io_ctx for direct access in the submit path;
- sio_pool_init() is now called unconditionally (not just for FS swap),
  which is necessary since bdev swap now also allocates from the sio pool;

swap_add_page() should be renamed to swap_add_folio(), since it takes a
folio rather than a page.

> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  mm/page_io.c  | 526 ++++++++++++++++++++++++--------------------------
>  mm/swap.h     |   1 +
>  mm/swapfile.c |   9 +-
>  3 files changed, 252 insertions(+), 284 deletions(-)
> 
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 0bf035dc1170..22c751fe03c0 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -28,54 +28,6 @@
>  #include "swap.h"
>  #include "swap_table.h"
>  
> -static void __end_swap_bio_write(struct bio *bio)
> -{
> -	struct folio *folio = bio_first_folio_all(bio);
> -
> -	if (bio->bi_status) {
> -		/*
> -		 * We failed to write the page out to swap-space.
> -		 * Re-dirty the page in order to avoid it being reclaimed.
> -		 * Also print a dire warning that things will go BAD (tm)
> -		 * very quickly.
> -		 *
> -		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
> -		 */
> -		folio_mark_dirty(folio);
> -		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
> -				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
> -				     (unsigned long long)bio->bi_iter.bi_sector);
> -		folio_clear_reclaim(folio);
> -	}
> -	folio_end_writeback(folio);
> -}
> -
> -static void end_swap_bio_write(struct bio *bio)
> -{
> -	__end_swap_bio_write(bio);
> -	bio_put(bio);
> -}
> -
> -static void __end_swap_bio_read(struct bio *bio)
> -{
> -	struct folio *folio = bio_first_folio_all(bio);
> -
> -	if (bio->bi_status) {
> -		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
> -				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
> -				     (unsigned long long)bio->bi_iter.bi_sector);
> -	} else {
> -		folio_mark_uptodate(folio);
> -	}
> -	folio_unlock(folio);
> -}
> -
> -static void end_swap_bio_read(struct bio *bio)
> -{
> -	__end_swap_bio_read(bio);
> -	bio_put(bio);
> -}
> -
>  int generic_swapfile_activate(struct swap_info_struct *sis,
>  				struct file *swap_file,
>  				sector_t *span)
> @@ -316,26 +268,47 @@ static inline void count_swpout_vm_event(struct folio *folio)
>  }
>  
>  #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
> -static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
> +static struct cgroup_subsys_state *folio_memcg_blkg_css(struct folio *folio)
> +{
> +	return cgroup_e_css(folio_memcg(folio)->css.cgroup, &io_cgrp_subsys);
> +}
> +
> +static bool folio_blkg_can_merge(struct folio *folio, struct folio *prev_folio)
>  {
> -	struct cgroup_subsys_state *css;
> -	struct mem_cgroup *memcg;
> +	if (!folio_memcg_charged(folio) || !folio_memcg_charged(prev_folio))
> +		return true;
> +
> +	rcu_read_lock();
> +	if (folio_memcg_blkg_css(folio) != folio_memcg_blkg_css(prev_folio)) {
> +		rcu_read_unlock();
> +		return false;
> +	}
> +	rcu_read_unlock();
> +
> +	return true;
> +}
>  
> +static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
> +{
>  	if (!folio_memcg_charged(folio))
>  		return;
> -
>  	rcu_read_lock();
> -	memcg = folio_memcg(folio);
> -	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
> -	bio_associate_blkg_from_css(bio, css);
> +	bio_associate_blkg_from_css(bio, folio_memcg_blkg_css(folio));
>  	rcu_read_unlock();
>  }
>  #else
> +static bool folio_blkg_can_merge(struct folio *folio, struct folio *prev_folio)
> +{
> +	return true;
> +}
>  #define bio_associate_blkg_from_page(bio, folio)		do { } while (0)
>  #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
>  
>  struct swap_iocb {
> -	struct kiocb		iocb;
> +	union {
> +		struct kiocb	iocb;
> +		struct bio	bio;
> +	};
>  	struct bio_vec		bvecs[SWAP_CLUSTER_MAX];
>  	int			nr_bvecs;
>  	int			len;
> @@ -355,171 +328,70 @@ int sio_pool_init(void)
>  	return 0;
>  }
>  
> -static void sio_write_complete(struct kiocb *iocb, long ret)
> +static bool swap_can_merge(struct swap_io_ctx *ctx, struct folio *folio,
> +		int rw)
>  {
> -	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> -	struct page *page = sio->bvecs[0].bv_page;
> -	int p;
> +	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> +	struct bio_vec *last_bv = &ctx->sio->bvecs[ctx->sio->nr_bvecs - 1];
> +	struct folio *prev_folio = page_folio(last_bv->bv_page);
> +	size_t prev_folio_size = folio_size(prev_folio);
>  
> -	if (ret != sio->len) {
> -		/*
> -		 * In the case of swap-over-nfs, this can be a
> -		 * temporary failure if the system has limited
> -		 * memory for allocating transmit buffers.
> -		 * Mark the page dirty and avoid
> -		 * folio_rotate_reclaimable but rate-limit the
> -		 * messages.
> -		 */
> -		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
> -				   ret, swap_dev_pos(page_swap_entry(page)));
> -		for (p = 0; p < sio->nr_bvecs; p++) {
> -			page = sio->bvecs[p].bv_page;
> -			set_page_dirty(page);
> -			ClearPageReclaim(page);
> -		}
> -	}
> +	if (ctx->sis != sis)
> +		return false;
>  
> -	for (p = 0; p < sio->nr_bvecs; p++)
> -		end_page_writeback(sio->bvecs[p].bv_page);
> +	if (sis->flags & SWP_FS_OPS) {
> +		if (swap_dev_pos(folio->swap) !=
> +		    swap_dev_pos(prev_folio->swap) + prev_folio_size)
> +			return false;
> +	} else {
> +		if (swap_folio_sector(folio) !=
> +		    swap_folio_sector(prev_folio) +
> +		    (prev_folio_size >> SECTOR_SHIFT))
> +			return false;
> +		if (rw == WRITE && !folio_blkg_can_merge(folio, prev_folio))
> +			return false;
> +	}
>  
> -	mempool_free(sio, sio_pool);
> +	return true;
>  }
>  
> -static void swap_writepage_fs(struct swap_io_ctx *ctx, struct folio *folio)
> +static void swap_add_page(struct swap_io_ctx *ctx, struct folio *folio, int rw)
>  {
> -	struct swap_iocb *sio = ctx->sio;
>  	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> -	struct file *swap_file = sis->swap_file;
> -	loff_t pos = swap_dev_pos(folio->swap);
> +	struct swap_iocb *sio = ctx->sio;
>  
> -	count_swpout_vm_event(folio);
> -	folio_start_writeback(folio);
> -	folio_unlock(folio);
> -	if (sio) {
> -		if (sio->iocb.ki_filp != swap_file ||
> -		    sio->iocb.ki_pos + sio->len != pos) {
> +	if (sio && !swap_can_merge(ctx, folio, rw)) {
> +		if (rw == WRITE)
>  			swap_write_submit(ctx);
> -			sio = NULL;
> -		}
> +		else
> +			swap_read_submit(ctx);
> +		sio = ctx->sio;
>  	}
> +
>  	if (!sio) {
> -		sio = mempool_alloc(sio_pool, GFP_NOIO);
> -		init_sync_kiocb(&sio->iocb, swap_file);
> -		sio->iocb.ki_complete = sio_write_complete;
> -		sio->iocb.ki_pos = pos;
> +		ctx->sis = sis;
> +		ctx->sio = sio = mempool_alloc(sio_pool, GFP_NOIO);
>  		sio->nr_bvecs = 0;
>  		sio->len = 0;
>  	}
>  	bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
>  	sio->len += folio_size(folio);
> -	sio->nr_bvecs += 1;
> -	if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs)) {
> -		swap_write_submit(ctx);
> -		sio = NULL;
> +	if (++sio->nr_bvecs == ARRAY_SIZE(sio->bvecs)) {
> +		if (rw == WRITE)
> +			swap_write_submit(ctx);
> +		else
> +			swap_read_submit(ctx);
>  	}
> -	ctx->sio = sio;
> -}
> -
> -static void swap_writepage_bdev_sync(struct folio *folio,
> -		struct swap_info_struct *sis)
> -{
> -	struct bio_vec bv;
> -	struct bio bio;
> -
> -	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP);
> -	bio.bi_iter.bi_sector = swap_folio_sector(folio);
> -	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
> -
> -	bio_associate_blkg_from_page(&bio, folio);
> -	count_swpout_vm_event(folio);
> -
> -	folio_start_writeback(folio);
> -	folio_unlock(folio);
> -
> -	submit_bio_wait(&bio);
> -	__end_swap_bio_write(&bio);
>  }
>  
> -static void swap_writepage_bdev_async(struct folio *folio,
> -		struct swap_info_struct *sis)
> +void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio)
>  {
> -	struct bio *bio;
> -
> -	bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP, GFP_NOIO);
> -	bio->bi_iter.bi_sector = swap_folio_sector(folio);
> -	bio->bi_end_io = end_swap_bio_write;
> -	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
> +	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
>  
> -	bio_associate_blkg_from_page(bio, folio);
>  	count_swpout_vm_event(folio);
>  	folio_start_writeback(folio);
>  	folio_unlock(folio);
> -	submit_bio(bio);
> -}
> -
> -void __swap_writepage(struct swap_io_ctx *ctx, struct folio *folio)
> -{
> -	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> -
> -	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> -	/*
> -	 * ->flags can be updated non-atomically,
> -	 * but that will never affect SWP_FS_OPS, so the data_race
> -	 * is safe.
> -	 */
> -	if (data_race(sis->flags & SWP_FS_OPS))
> -		swap_writepage_fs(ctx, folio);
> -	/*
> -	 * ->flags can be updated non-atomically,
> -	 * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
> -	 * is safe.
> -	 */
> -	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
> -		swap_writepage_bdev_sync(folio, sis);
> -	else
> -		swap_writepage_bdev_async(folio, sis);
> -}
> -
> -void swap_write_submit(struct swap_io_ctx *ctx)
> -{
> -	struct swap_iocb *sio = ctx->sio;
> -	struct iov_iter from;
> -	int ret;
> -
> -	if (!sio)
> -		return;
> -
> -	iov_iter_bvec(&from, ITER_SOURCE, sio->bvecs, sio->nr_bvecs, sio->len);
> -	ret = sio->iocb.ki_filp->f_mapping->a_ops->swap_rw(&sio->iocb, &from);
> -	if (ret != -EIOCBQUEUED)
> -		sio_write_complete(&sio->iocb, ret);
> -	ctx->sio = NULL;
> -}
> -
> -static void sio_read_complete(struct kiocb *iocb, long ret)
> -{
> -	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> -	int p;
> -
> -	if (ret == sio->len) {
> -		for (p = 0; p < sio->nr_bvecs; p++) {
> -			struct folio *folio = page_folio(sio->bvecs[p].bv_page);
> -
> -			count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> -			count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> -			folio_mark_uptodate(folio);
> -			folio_unlock(folio);
> -		}
> -		count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
> -	} else {
> -		for (p = 0; p < sio->nr_bvecs; p++) {
> -			struct folio *folio = page_folio(sio->bvecs[p].bv_page);
> -
> -			folio_unlock(folio);
> -		}
> -		pr_alert_ratelimited("Read-error on swap-device\n");
> -	}
> -	mempool_free(sio, sio_pool);
> +	swap_add_page(ctx, folio, WRITE);
>  }
>  
>  /*
> @@ -585,74 +457,6 @@ static bool swap_read_folio_zeromap(struct folio *folio)
>  	return true;
>  }
>  
> -static void swap_read_folio_fs(struct swap_io_ctx *ctx, struct folio *folio)
> -{
> -	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> -	struct swap_iocb *sio = ctx->sio;
> -	loff_t pos = swap_dev_pos(folio->swap);
> -
> -	if (sio) {
> -		if (sio->iocb.ki_filp != sis->swap_file ||
> -		    sio->iocb.ki_pos + sio->len != pos) {
> -			swap_read_submit(ctx);
> -			sio = NULL;
> -		}
> -	}
> -	if (!sio) {
> -		sio = mempool_alloc(sio_pool, GFP_KERNEL);
> -		init_sync_kiocb(&sio->iocb, sis->swap_file);
> -		sio->iocb.ki_pos = pos;
> -		sio->iocb.ki_complete = sio_read_complete;
> -		sio->nr_bvecs = 0;
> -		sio->len = 0;
> -	}
> -	bvec_set_folio(&sio->bvecs[sio->nr_bvecs], folio, folio_size(folio), 0);
> -	sio->len += folio_size(folio);
> -	sio->nr_bvecs += 1;
> -	if (sio->nr_bvecs == ARRAY_SIZE(sio->bvecs)) {
> -		swap_read_submit(ctx);
> -		sio = NULL;
> -	}
> -	ctx->sio = sio;
> -}
> -
> -static void swap_read_folio_bdev_sync(struct folio *folio,
> -		struct swap_info_struct *sis)
> -{
> -	struct bio_vec bv;
> -	struct bio bio;
> -
> -	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
> -	bio.bi_iter.bi_sector = swap_folio_sector(folio);
> -	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
> -	/*
> -	 * Keep this task valid during swap readpage because the oom killer may
> -	 * attempt to access it in the page fault retry time check.
> -	 */
> -	get_task_struct(current);
> -	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> -	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> -	count_vm_events(PSWPIN, folio_nr_pages(folio));
> -	submit_bio_wait(&bio);
> -	__end_swap_bio_read(&bio);
> -	put_task_struct(current);
> -}
> -
> -static void swap_read_folio_bdev_async(struct folio *folio,
> -		struct swap_info_struct *sis)
> -{
> -	struct bio *bio;
> -
> -	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
> -	bio->bi_iter.bi_sector = swap_folio_sector(folio);
> -	bio->bi_end_io = end_swap_bio_read;
> -	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
> -	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> -	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> -	count_vm_events(PSWPIN, folio_nr_pages(folio));
> -	submit_bio(bio);
> -}
> -
>  void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
>  {
>  	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> @@ -686,14 +490,7 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
>  
>  	/* We have to read from slower devices. Increase zswap protection. */
>  	zswap_folio_swapin(folio);
> -
> -	if (data_race(sis->flags & SWP_FS_OPS)) {
> -		swap_read_folio_fs(ctx, folio);
> -	} else if (synchronous) {
> -		swap_read_folio_bdev_sync(folio, sis);
> -	} else {
> -		swap_read_folio_bdev_async(folio, sis);
> -	}
> +	swap_add_page(ctx, folio, READ);
>  
>  finish:
>  	if (workingset) {
> @@ -703,18 +500,189 @@ void swap_read_folio(struct swap_io_ctx *ctx, struct folio *folio)
>  	delayacct_swapin_end();
>  }
>  
> -void swap_read_submit(struct swap_io_ctx *ctx)
> +static void swap_write_end(struct swap_iocb *sio, bool failed)
> +{
> +	int p;
> +
> +	for (p = 0; p < sio->nr_bvecs; p++) {
> +		struct page *page = sio->bvecs[p].bv_page;
> +
> +		if (failed) {
> +			set_page_dirty(page);
> +			ClearPageReclaim(page);
> +		}
> +		end_page_writeback(page);
> +	}
> +	mempool_free(sio, sio_pool);
> +}
> +
> +static void swap_fs_write_complete(struct kiocb *iocb, long ret)
> +{
> +	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> +	bool failed = ret != sio->len;
> +
> +	if (failed) {
> +		struct page *page = sio->bvecs[0].bv_page;
> +
> +		/*
> +		 * In the case of swap-over-nfs, this can be a temporary failure
> +		 * if the system has limited memory for allocating transmit
> +		 * buffers.  Mark the page dirty and avoid
> +		 * folio_rotate_reclaimable but rate-limit the messages.
> +		 */
> +		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
> +				   ret, swap_dev_pos(page_swap_entry(page)));
> +	}
> +
> +	swap_write_end(sio, failed);
> +}
> +
> +static void end_swap_bio_write(struct bio *bio)
> +{
> +	struct swap_iocb *sio = container_of(bio, struct swap_iocb, bio);
> +	bool failed = !!bio->bi_status;
> +
> +	if (failed)
> +		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
> +				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
> +				     (unsigned long long)bio->bi_iter.bi_sector);
> +	bio_uninit(bio);
> +	swap_write_end(sio, failed);
> +}
> +
> +static void swap_read_end(struct swap_iocb *sio, bool failed)
> +{
> +	int p;
> +
> +	for (p = 0; p < sio->nr_bvecs; p++) {
> +		struct folio *folio = page_folio(sio->bvecs[p].bv_page);
> +
> +		if (!failed) {
> +			count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN);
> +			count_memcg_folio_events(folio, PSWPIN,
> +					folio_nr_pages(folio));
> +			folio_mark_uptodate(folio);
> +		}
> +		folio_unlock(folio);
> +	}
> +
> +	if (!failed)
> +		count_vm_events(PSWPIN, sio->len >> PAGE_SHIFT);
> +
> +	mempool_free(sio, sio_pool);
> +}
> +
> +static void swap_fs_read_complete(struct kiocb *iocb, long ret)
> +{
> +	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
> +	bool failed = ret != sio->len;
> +
> +	if (failed)
> +		pr_alert_ratelimited("Read-error on swap-device\n");
> +	swap_read_end(sio, failed);
> +}
> +
> +static void swap_bio_read_end_io(struct bio *bio)
> +{
> +	struct swap_iocb *sio = container_of(bio, struct swap_iocb, bio);
> +	bool failed = !!bio->bi_status;
> +
> +	if (failed)
> +		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
> +				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
> +				     (unsigned long long)bio->bi_iter.bi_sector);
> +	bio_uninit(bio);
> +	swap_read_end(sio, failed);
> +}
> +
> +static void swap_bdev_submit_write(struct swap_io_ctx *ctx)
>  {
>  	struct swap_iocb *sio = ctx->sio;
> -	struct iov_iter from;
> +	struct bio *bio = &sio->bio;
> +
> +	bio_init(bio, ctx->sis->bdev, sio->bvecs, ARRAY_SIZE(sio->bvecs),
> +			REQ_OP_WRITE | REQ_SWAP);
> +	bio->bi_iter.bi_size = sio->len;
> +	bio->bi_iter.bi_sector = swap_folio_sector(bio_first_folio_all(bio));
> +	bio_associate_blkg_from_page(bio, bio_first_folio_all(bio));
> +
> +	if (ctx->sis->flags & SWP_SYNCHRONOUS_IO) {
> +		submit_bio_wait(bio);
> +		end_swap_bio_write(bio);
> +	} else {
> +		bio->bi_end_io = end_swap_bio_write;
> +		submit_bio(bio);
> +	}
> +}
> +
> +static void swap_bdev_submit_read(struct swap_io_ctx *ctx)
> +{
> +	struct swap_iocb *sio = ctx->sio;
> +	struct bio *bio = &sio->bio;
> +
> +	bio_init(bio, ctx->sis->bdev, sio->bvecs, ARRAY_SIZE(sio->bvecs),
> +			REQ_OP_READ);
> +	bio->bi_iter.bi_size = sio->len;
> +	bio->bi_iter.bi_sector = swap_folio_sector(bio_first_folio_all(bio));
> +
> +	if (ctx->sis->flags & SWP_SYNCHRONOUS_IO) {
> +		/*
> +		 * Keep this task valid during swap readpage because the oom
> +		 * killer may attempt to access it in the page fault retry
> +		 * time check.
> +		 */
> +		get_task_struct(current);
> +		submit_bio_wait(bio);
> +		swap_bio_read_end_io(bio);
> +		put_task_struct(current);
> +	} else {
> +		bio->bi_end_io = swap_bio_read_end_io;
> +		submit_bio(bio);
> +	}
> +}
> +
> +static void swap_fs_submit(struct swap_io_ctx *ctx, int rw)
> +{
> +	struct swap_iocb *sio = ctx->sio;
> +	struct iov_iter iter;
>  	int ret;
>  
> -	if (!sio)
> -		return;
> +	init_sync_kiocb(&sio->iocb, ctx->sis->swap_file);
> +	sio->iocb.ki_pos = swap_dev_pos(page_folio(sio->bvecs[0].bv_page)->swap);
> +	if (rw == WRITE)
> +		sio->iocb.ki_complete = swap_fs_write_complete;
> +	else
> +		sio->iocb.ki_complete = swap_fs_read_complete;
>  
> -	iov_iter_bvec(&from, ITER_DEST, sio->bvecs, sio->nr_bvecs, sio->len);
> -	ret = sio->iocb.ki_filp->f_mapping->a_ops->swap_rw(&sio->iocb, &from);
> +	iov_iter_bvec(&iter, rw == WRITE ? ITER_SOURCE : ITER_DEST,
> +			sio->bvecs, sio->nr_bvecs, sio->len);
> +	ret = sio->iocb.ki_filp->f_mapping->a_ops->swap_rw(&sio->iocb, &iter);
>  	if (ret != -EIOCBQUEUED)
> -		sio_read_complete(&sio->iocb, ret);
> +		sio->iocb.ki_complete(&sio->iocb, ret);
> +}
> +
> +void swap_write_submit(struct swap_io_ctx *ctx)
> +{
> +	if (!ctx->sio)
> +		return;
> +
> +	if (ctx->sis->flags & SWP_FS_OPS)
> +		swap_fs_submit(ctx, WRITE);
> +	else
> +		swap_bdev_submit_write(ctx);
> +	ctx->sio = NULL;
> +	ctx->sis = NULL;
> +}
> +
> +void swap_read_submit(struct swap_io_ctx *ctx)
> +{
> +	if (!ctx->sio)
> +		return;
> +
> +	if (ctx->sis->flags & SWP_FS_OPS)
> +		swap_fs_submit(ctx, READ);
> +	else
> +		swap_bdev_submit_read(ctx);
>  	ctx->sio = NULL;
> +	ctx->sis = NULL;
>  }
> diff --git a/mm/swap.h b/mm/swap.h
> index 79d66272dfd4..b6ba80c2afb0 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -81,6 +81,7 @@ enum swap_cluster_flags {
>  
>  struct swap_io_ctx {
>  	struct swap_iocb	*sio;
> +	struct swap_info_struct	*sis;
>  };
>  
>  #ifdef CONFIG_SWAP
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 615d90867111..2372f7cc4653 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -2842,6 +2842,10 @@ static int setup_swap_extents(struct swap_info_struct *sis,
>  	struct inode *inode = mapping->host;
>  	int ret;
>  
> +	ret = sio_pool_init();
> +	if (ret)
> +		return ret;
> +
>  	if (S_ISBLK(inode->i_mode)) {
>  		ret = add_swap_extent(sis, 0, sis->max, 0);
>  		*span = sis->pages;
> @@ -2853,11 +2857,6 @@ static int setup_swap_extents(struct swap_info_struct *sis,
>  		if (ret < 0)
>  			return ret;
>  		sis->flags |= SWP_ACTIVATED;
> -		if ((sis->flags & SWP_FS_OPS) &&
> -		    sio_pool_init() != 0) {
> -			destroy_swap_extents(sis, swap_file);
> -			return -ENOMEM;
> -		}
>  		return ret;
>  	}
>  
> -- 
> 2.53.0
>

next prev parent reply	other threads:[~2026-06-04 11:37 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-01 11:34 better block swap batching and a different take on swap_ops v2 Christoph Hellwig
2026-06-01 11:34 ` [PATCH 1/8] shmem: provide a shmem_write_folio wrapper Christoph Hellwig
2026-06-04  9:43   ` Baoquan He
2026-06-05 17:07   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 2/8] mm: merge writeout into pageout Christoph Hellwig
2026-06-04  9:44   ` Baoquan He
2026-06-05 17:07   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 3/8] mm/swap: introduce struct swap_io_ctx Christoph Hellwig
2026-06-04 10:58   ` Baoquan He
2026-06-05 17:41   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 4/8] mm/swap: also use struct swap_iocb for block I/O Christoph Hellwig
2026-06-04 10:59   ` Baoquan He
2026-06-04 11:37   ` Baoquan He [this message]
2026-06-01 11:34 ` [PATCH 5/8] mm/swap: remove count_swpout_vm_event Christoph Hellwig
2026-06-04 11:37   ` Baoquan He
2026-06-05 17:50   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 6/8] mm/swap: use swap_ops to register swap device's methods Christoph Hellwig
2026-06-05 17:53   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 7/8] mm/swap: remove SWP_FS_OPS Christoph Hellwig
2026-06-05  5:21   ` Baoquan He
2026-06-05 17:58   ` Nhat Pham
2026-06-01 11:34 ` [PATCH 8/8] mm/vmstat: add NRSWP{IN,OUT} counters Christoph Hellwig
2026-06-05  7:16   ` Baoquan He
2026-06-05 17:48   ` Nhat Pham
2026-06-01 13:29 ` better block swap batching and a different take on swap_ops v2 Baoquan He
2026-06-01 14:50   ` Christoph Hellwig
2026-06-01 15:17     ` Baoquan He
2026-06-01 15:25       ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aiFjZXQ73awVxUFd@MiWiFi-R3L-srv \
    --to=baoquan.he@linux.dev \
    --cc=akpm@linux-foundation.org \
    --cc=chrisl@kernel.org \
    --cc=hch@lst.de \
    --cc=kasong@tencent.com \
    --cc=linux-mm@kvack.org \
    --cc=nphamcs@gmail.com \
    --cc=shikemeng@huaweicloud.com \
    --cc=usama.arif@linux.dev \
    --cc=youngjun.park@lge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.