All of lore.kernel.org
 help / color / mirror / Atom feed
From: Baoquan He <bhe@redhat.com>
To: Usama Arif <usamaarif642@gmail.com>
Cc: linux-mm@kvack.org, akpm@linux-foundation.org, chrisl@kernel.org,
	kasong@tencent.com, shikemeng@huaweicloud.com, nphamcs@gmail.com,
	baohua@kernel.org, youngjun.park@lge.com
Subject: Re: [PATCH 2/3] mm/swap: use swap_ops to register swap device's methods
Date: Tue, 3 Mar 2026 18:41:32 +0800	[thread overview]
Message-ID: <aaa63OdTm-JFVa1J@MiWiFi-R3L-srv> (raw)
In-Reply-To: <20260302145307.320941-1-usamaarif642@gmail.com>

On 03/02/26 at 06:53am, Usama Arif wrote:
> On Mon,  2 Mar 2026 18:40:15 +0800 Baoquan He <bhe@redhat.com> wrote:
> 
> > This simplifies codes and makes logic clearer. And also makes later any
> > new swap device type being added easier to handle.
> > 
> > Currently there are three types of swap devices: bdev_fs, bdev_sync
> > and bdev_async, and only operations read_folio and write_folio are
> > included. In the future, there could be more swap device types added
> > and more appropriate opeations adapted into swap_ops.
> > 
> > Signed-off-by: Baoquan He <bhe@redhat.com>
> > ---
> >  include/linux/swap.h |  13 ++++++
> >  mm/swap.h            |   1 -
> >  mm/swap_io.c         | 102 +++++++++++++++++++++++++------------------
> >  mm/swapfile.c        |   2 +
> >  mm/zswap.c           |   3 +-
> >  5 files changed, 76 insertions(+), 45 deletions(-)
> > 
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 0effe3cc50f5..448e5e66ec5c 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -19,6 +19,7 @@
> >  struct notifier_block;
> >  
> >  struct bio;
> > +struct swap_iocb;
> >  
> >  struct pagevec;
> >  
> > @@ -222,6 +223,17 @@ enum {
> >  #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
> >  #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
> >  
> > +struct swap_ops {
> > +	void (*read_folio)(struct swap_info_struct *sis,
> > +			   struct folio *folio,
> > +			   struct swap_iocb **plug);
> > +	void (*write_folio)(struct swap_info_struct *sis,
> > +			    struct folio *folio,
> > +			    struct swap_iocb **plug);
> > +};
> > +
> > +int probe_swap_fs(struct swap_info_struct *sis);
> > +
> 
> Would it be better to put these in mm/swap.h as they are only used in mm/?

Right, other reviewers also pointed this out. Will change in v2.

> 
> >  /*
> >   * The first page in the swap file is the swap header, which is always marked
> >   * bad to prevent it from being allocated as an entry. This also prevents the
> > @@ -284,6 +296,7 @@ struct swap_info_struct {
> >  	struct work_struct reclaim_work; /* reclaim worker */
> >  	struct list_head discard_clusters; /* discard clusters list */
> >  	struct plist_node avail_list;   /* entry in swap_avail_head */
> > +	struct swap_ops *ops;
> >  };
> >  
> >  static inline swp_entry_t page_swap_entry(struct page *page)
> > diff --git a/mm/swap.h b/mm/swap.h
> > index 161185057993..c390df3f5889 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -226,7 +226,6 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
> >  }
> >  void swap_write_unplug(struct swap_iocb *sio);
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
> >  
> >  /* linux/mm/swap_state.c */
> >  extern struct address_space swap_space __read_mostly;
> > diff --git a/mm/swap_io.c b/mm/swap_io.c
> > index d1cdb10ba133..47077b345ae3 100644
> > --- a/mm/swap_io.c
> > +++ b/mm/swap_io.c
> > @@ -240,6 +240,7 @@ static void swap_zeromap_folio_clear(struct folio *folio)
> >  int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >  {
> >  	int ret = 0;
> > +	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >  
> >  	if (folio_free_swap(folio))
> >  		goto out_unlock;
> > @@ -281,7 +282,8 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
> >  		return AOP_WRITEPAGE_ACTIVATE;
> >  	}
> >  
> > -	__swap_writepage(folio, swap_plug);
> > +	if (sis->ops && sis->ops->write_folio)
> > +		sis->ops->write_folio(sis, folio, swap_plug);
> 
> The old __swap_writepage() always dispatched to one of the three write
> functions unconditionally. If the guard condition is false (ops is NULL),
> swap_writeout() returns 0 (success) but the folio is never unlocked --
> the write functions are the ones that call folio_unlock(). Would this
> leave the folio locked and lead to a deadlock? Similar issue in swap_read_folio.

Hmm, for now NULL sis->ops won't happen. But we could have it in the
future, means there could be a swap device w/o read/write_folio methods.

> 
> >  	return 0;
> >  out_unlock:
> >  	folio_unlock(folio);
> > @@ -371,10 +373,11 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
> >  	mempool_free(sio, sio_pool);
> >  }
> >  
> > -static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
> > +static void swap_writepage_fs(struct swap_info_struct *sis,
> > +			      struct folio *folio,
> > +			      struct swap_iocb **swap_plug)
> >  {
> >  	struct swap_iocb *sio = swap_plug ? *swap_plug : NULL;
> > -	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >  	struct file *swap_file = sis->swap_file;
> >  	loff_t pos = swap_dev_pos(folio->swap);
> >  
> > @@ -407,8 +410,9 @@ static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug)
> >  		*swap_plug = sio;
> >  }
> >  
> > -static void swap_writepage_bdev_sync(struct folio *folio,
> > -		struct swap_info_struct *sis)
> > +static void swap_writepage_bdev_sync(struct swap_info_struct *sis,
> > +				     struct folio *folio,
> > +				     struct swap_iocb **plug)
> >  {
> >  	struct bio_vec bv;
> >  	struct bio bio;
> > @@ -427,8 +431,9 @@ static void swap_writepage_bdev_sync(struct folio *folio,
> >  	__end_swap_bio_write(&bio);
> >  }
> >  
> > -static void swap_writepage_bdev_async(struct folio *folio,
> > -		struct swap_info_struct *sis)
> > +static void swap_writepage_bdev_async(struct swap_info_struct *sis,
> > +				      struct folio *folio,
> > +				      struct swap_iocb **plug)
> >  {
> >  	struct bio *bio;
> >  
> > @@ -444,29 +449,6 @@ static void swap_writepage_bdev_async(struct folio *folio,
> >  	submit_bio(bio);
> >  }
> >  
> > -void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug)
> > -{
> > -	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> > -
> > -	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
> > -	/*
> > -	 * ->flags can be updated non-atomically (scan_swap_map_slots),
> > -	 * but that will never affect SWP_FS_OPS, so the data_race
> > -	 * is safe.
> > -	 */
> > -	if (data_race(sis->flags & SWP_FS_OPS))
> > -		swap_writepage_fs(folio, swap_plug);
> > -	/*
> > -	 * ->flags can be updated non-atomically (scan_swap_map_slots),
> > -	 * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
> > -	 * is safe.
> > -	 */
> > -	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
> > -		swap_writepage_bdev_sync(folio, sis);
> > -	else
> > -		swap_writepage_bdev_async(folio, sis);
> > -}
> > -
> >  void swap_write_unplug(struct swap_iocb *sio)
> >  {
> >  	struct iov_iter from;
> > @@ -535,9 +517,10 @@ static bool swap_read_folio_zeromap(struct folio *folio)
> >  	return true;
> >  }
> >  
> > -static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
> > +static void swap_read_folio_fs(struct swap_info_struct *sis,
> > +			       struct folio *folio,
> > +			       struct swap_iocb **plug)
> >  {
> > -	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> >  	struct swap_iocb *sio = NULL;
> >  	loff_t pos = swap_dev_pos(folio->swap);
> >  
> > @@ -569,8 +552,9 @@ static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
> >  		*plug = sio;
> >  }
> >  
> > -static void swap_read_folio_bdev_sync(struct folio *folio,
> > -		struct swap_info_struct *sis)
> > +static void swap_read_folio_bdev_sync(struct swap_info_struct *sis,
> > +				      struct folio *folio,
> > +				      struct swap_iocb **plug)
> >  {
> >  	struct bio_vec bv;
> >  	struct bio bio;
> > @@ -591,8 +575,9 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
> >  	put_task_struct(current);
> >  }
> >  
> > -static void swap_read_folio_bdev_async(struct folio *folio,
> > -		struct swap_info_struct *sis)
> > +static void swap_read_folio_bdev_async(struct swap_info_struct *sis,
> > +				       struct folio *folio,
> > +				       struct swap_iocb **plug)
> >  {
> >  	struct bio *bio;
> >  
> > @@ -606,6 +591,42 @@ static void swap_read_folio_bdev_async(struct folio *folio,
> >  	submit_bio(bio);
> >  }
> >  
> > +static struct swap_ops bdev_fs_swap_ops = {
> > +	.read_folio = swap_read_folio_fs,
> > +	.write_folio = swap_writepage_fs,
> > +};
> > +
> > +static struct swap_ops bdev_sync_swap_ops = {
> > +	.read_folio = swap_read_folio_bdev_sync,
> > +	.write_folio = swap_writepage_bdev_sync,
> > +};
> > +
> > +static struct swap_ops bdev_async_swap_ops = {
> > +	.read_folio = swap_read_folio_bdev_async,
> > +	.write_folio = swap_writepage_bdev_async,
> > +};
> > +
> 
> Should we have all of these as static const struct swap_ops?

You are right, I will fix them in v2.

> 
> > +int probe_swap_fs(struct swap_info_struct *sis)
> > +{
> > +	/*
> > +	 * ->flags can be updated non-atomically (scan_swap_map_slots),
> > +	 * but that will never affect SWP_FS_OPS, so the data_race
> > +	 * is safe.
> > +	 */
> > +	if (data_race(sis->flags & SWP_FS_OPS))
> > +		sis->ops = &bdev_fs_swap_ops;
> > +	/*
> > +	 * ->flags can be updated non-atomically (scan_swap_map_slots),
> > +	 * but that will never affect SWP_SYNCHRONOUS_IO, so the data_race
> > +	 * is safe.
> > +	 */
> > +	else if (data_race(sis->flags & SWP_SYNCHRONOUS_IO))
> > +		sis->ops = &bdev_sync_swap_ops;
> > +	else
> > +		sis->ops = &bdev_async_swap_ops;
> > +	return 0;
> 
> The return is always 0, so this function could be void.

Yes, I will change.

> 
> > +}
> > +
> >  void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >  {
> >  	struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
> > @@ -640,13 +661,8 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
> >  	/* We have to read from slower devices. Increase zswap protection. */
> >  	zswap_folio_swapin(folio);
> >  
> > -	if (data_race(sis->flags & SWP_FS_OPS)) {
> > -		swap_read_folio_fs(folio, plug);
> > -	} else if (synchronous) {
> > -		swap_read_folio_bdev_sync(folio, sis);
> > -	} else {
> > -		swap_read_folio_bdev_async(folio, sis);
> > -	}
> > +	if (sis->ops && sis->ops->read_folio)
> > +		sis->ops->read_folio(sis, folio, plug);
> >  
> >  finish:
> >  	if (workingset) {
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 915bc93964db..af498f9af328 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -3625,6 +3625,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
> >  	/* Sets SWP_WRITEOK, resurrect the percpu ref, expose the swap device */
> >  	enable_swap_info(si);
> >  
> > +	probe_swap_fs(si);
> > +
> 
> Should probe_swap_fs() be called before enable_swap_info() rather than
> after it? enable_swap_info() sets SWP_WRITEOK and adds the device to
> swap_active_head, making it available for allocation. At that point
> si->ops is still NULL. If another CPU allocates swap from the new
> device and reclaim writes to it before probe_swap_fs() runs, the
> write will be silently dropped.

Good catch and I agree with you. I will change to call probe_swap_fs() 
before enable_swap_info().

> 
> >  	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
> >  		K(si->pages), name->name, si->prio, nr_extents,
> >  		K((unsigned long long)span),
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index a399f7a10830..7ce906249c7a 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -1055,7 +1055,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >  	folio_set_reclaim(folio);
> >  
> >  	/* start writeback */
> > -	__swap_writepage(folio, NULL);
> > +	if (si->ops && si->ops->write_folio)
> > +		si->ops->write_folio(si, folio, NULL);
> >  
> >  out:
> >  	if (ret && ret != -EEXIST) {
> > -- 
> > 2.52.0
> 



  reply	other threads:[~2026-03-03 10:41 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-02 10:40 [PATCH 0/3] mm/swap: use swap_ops to register swap device's methods Baoquan He
2026-03-02 10:40 ` [PATCH 1/3] mm/swap: rename mm/page_io.c to mm/swap_io.c Baoquan He
2026-03-02 10:56   ` Barry Song
2026-03-02 13:25     ` Baoquan He
2026-03-02 21:12   ` Nhat Pham
2026-03-03  7:24     ` Baoquan He
2026-03-04  6:40   ` Kairui Song
2026-03-02 10:40 ` [PATCH 2/3] mm/swap: use swap_ops to register swap device's methods Baoquan He
2026-03-02 11:11   ` Barry Song
2026-03-02 14:47     ` Baoquan He
2026-03-02 19:28     ` Chris Li
2026-03-02 12:20   ` YoungJun Park
2026-03-02 14:09   ` YoungJun Park
2026-03-02 19:35     ` Chris Li
2026-03-03  7:14       ` Baoquan He
2026-03-02 14:53   ` Usama Arif
2026-03-03 10:41     ` Baoquan He [this message]
2026-03-02 21:21   ` Nhat Pham
2026-03-03  3:01     ` Baoquan He
2026-03-02 10:40 ` [PATCH 3/3] mm/swap_io.c: rename swap_writepage_* to swap_write_folio_* Baoquan He
2026-03-02 11:28   ` Barry Song
2026-03-02 21:11   ` Nhat Pham
2026-03-02 14:43 ` [PATCH 0/3] mm/swap: use swap_ops to register swap device's methods YoungJun Park
2026-03-05  5:18   ` Baoquan He
2026-03-05  8:09     ` YoungJun Park

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aaa63OdTm-JFVa1J@MiWiFi-R3L-srv \
    --to=bhe@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=baohua@kernel.org \
    --cc=chrisl@kernel.org \
    --cc=kasong@tencent.com \
    --cc=linux-mm@kvack.org \
    --cc=nphamcs@gmail.com \
    --cc=shikemeng@huaweicloud.com \
    --cc=usamaarif642@gmail.com \
    --cc=youngjun.park@lge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.