Re: [RFC PATCH v2 2/7] mm, swap: support zswap and zeroswap as vswap backends

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

From: Yosry Ahmed <yosry@kernel.org>
To: Nhat Pham <nphamcs@gmail.com>
Cc: akpm@linux-foundation.org, chrisl@kernel.org, kasong@tencent.com,
	 hannes@cmpxchg.org, mhocko@kernel.org, roman.gushchin@linux.dev,
	 shakeel.butt@linux.dev, david@kernel.org, muchun.song@linux.dev,
	 shikemeng@huaweicloud.com, baoquan.he@linux.dev,
	baohua@kernel.org, youngjun.park@lge.com,
	 chengming.zhou@linux.dev, ljs@kernel.org, liam@infradead.org,
	vbabka@kernel.org,  rppt@kernel.org, surenb@google.com,
	qi.zheng@linux.dev, axelrasmussen@google.com,
	 yuanchu@google.com, weixugc@google.com, riel@surriel.com,
	gourry@gourry.net,  haowenchao22@gmail.com, kernel-team@meta.com,
	linux-mm@kvack.org,  linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org
Subject: Re: [RFC PATCH v2 2/7] mm, swap: support zswap and zeroswap as vswap backends
Date: Tue, 23 Jun 2026 00:15:58 +0000	[thread overview]
Message-ID: <ajnNWRO7apBq2-kQ@google.com> (raw)
In-Reply-To: <20260612193738.2183968-3-nphamcs@gmail.com>

On Fri, Jun 12, 2026 at 12:37:33PM -0700, Nhat Pham wrote:
> Build the virtual swap layer on top of the swap-table infrastructure.
> Virtual swap entries decouple PTE swap entries from physical backing,
> allowing pages to be compressed by zswap (or detected as zero-filled)
> without pre-allocating a physical swap slot.
> 
> This patch only supports zswap and zero-page backends. If zswap_store
> fails, the page stays dirty in the swap cache (AOP_WRITEPAGE_ACTIVATE)
> - physical disk backing fallback comes in the next patch. Zswap
> writeback of vswap-backed entries is also disabled - the shrinker
> skips when no physical swap pages are available.
> 
> Suggested-by: Kairui Song <kasong@tencent.com>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
[..]
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 993406074d58..466f8a182716 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -38,6 +38,7 @@
>  #include <linux/zsmalloc.h>
>  
>  #include "swap.h"
> +#include "vswap.h"
>  #include "internal.h"
>  
>  /*********************************
> @@ -762,7 +763,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
>   * Carries out the common pattern of freeing an entry's zsmalloc allocation,
>   * freeing the entry itself, and decrementing the number of stored pages.
>   */
> -static void zswap_entry_free(struct zswap_entry *entry)
> +void zswap_entry_free(struct zswap_entry *entry)
>  {
>  	zswap_lru_del(&zswap_list_lru, entry);
>  	zs_free(entry->pool->zs_pool, entry->handle);
> @@ -994,16 +995,21 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>  	struct swap_info_struct *si;
>  	int ret = 0;
>  
> +	/* try to allocate swap cache folio */
>  	si = get_swap_device(swpentry);
>  	if (!si)
>  		return -EEXIST;
>  
> +	/*
> +	 * Vswap entries have no physical backing - writeback would fail
> +	 * and SIGBUS the caller. Bail before we waste a swap-cache folio
> +	 * allocation.
> +	 */

Seems like this comment belongs in the previous patch, and the other
comment movement is undoing what last patch did.

>  	if (si->flags & SWP_VSWAP) {
>  		put_swap_device(si);
>  		return -EINVAL;
>  	}
>  
> -	/* try to allocate swap cache folio */
>  	mpol = get_task_policy(current);
>  	folio = swap_cache_alloc_folio(swpentry, GFP_KERNEL, BIT(0), NULL, mpol,
>  				       NO_INTERLEAVE_INDEX);
> @@ -1416,25 +1422,25 @@ static bool zswap_store_page(struct page *page,
>  	if (!zswap_compress(page, entry, pool))
>  		goto compress_failed;
>  
> -	old = xa_store(swap_zswap_tree(page_swpentry),
> -		       swp_offset(page_swpentry),
> -		       entry, GFP_KERNEL);
> -	if (xa_is_err(old)) {
> -		int err = xa_err(old);
> +	if (is_vswap_entry(page_swpentry)) {
> +		vswap_zswap_store(page_swpentry, entry);
> +	} else {
> +		old = xa_store(swap_zswap_tree(page_swpentry),
> +			       swp_offset(page_swpentry),
> +			       entry, GFP_KERNEL);
> +		if (xa_is_err(old)) {
> +			int err = xa_err(old);
> +
> +			WARN_ONCE(err != -ENOMEM,
> +				  "unexpected xarray error: %d\n", err);
> +			zswap_reject_alloc_fail++;
> +			goto store_failed;
> +		}
>  
> -		WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> -		zswap_reject_alloc_fail++;
> -		goto store_failed;
> +		if (old)
> +			zswap_entry_free(old);
>  	}
>  
> -	/*
> -	 * We may have had an existing entry that became stale when
> -	 * the folio was redirtied and now the new version is being
> -	 * swapped out. Get rid of the old.
> -	 */
> -	if (old)
> -		zswap_entry_free(old);
> -
>  	/*
>  	 * The entry is successfully compressed and stored in the tree, there is
>  	 * no further possibility of failure. Grab refs to the pool and objcg,
> @@ -1487,6 +1493,7 @@ bool zswap_store(struct folio *folio)
>  	struct mem_cgroup *memcg = NULL;
>  	struct zswap_pool *pool;
>  	bool ret = false;
> +	bool partial_store = false;
>  	long index;
>  
>  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
> @@ -1524,8 +1531,10 @@ bool zswap_store(struct folio *folio)
>  	for (index = 0; index < nr_pages; ++index) {
>  		struct page *page = folio_page(folio, index);
>  
> -		if (!zswap_store_page(page, objcg, pool))
> +		if (!zswap_store_page(page, objcg, pool)) {
> +			partial_store = index > 0;
>  			goto put_pool;
> +		}
>  	}
>  
>  	if (objcg)
> @@ -1548,7 +1557,9 @@ bool zswap_store(struct folio *folio)
>  	 * offsets corresponding to each page of the folio. Otherwise,
>  	 * writeback could overwrite the new data in the swapfile.
>  	 */
> -	if (!ret) {
> +	if (partial_store && is_vswap_entry(swp))
> +		folio_release_vswap_backing(folio);

Hmm the above should also only happen in the !ret case, but that's not
obvious from the code here. I think all of this should go under if
(!ret), but maybe reverse the polarity to avoid the indentation?

	if (ret)
		return ret;

	if (is_vswap_entry(swp)) {
		if (partial_store)
			folio_release_vswap_backing(folio);
		return ret;
	}

	...

Alternatively you can move the check_old code for xarray into a helper
and do:

	if (!ret) {
		if (is_vswap_entry(swp)) {
			if (partial_store)
				folio_release_vswap_backing(folio);
		} else {
			zswap_free_old_xa_entries(swp, nr_pages)
		}
	}

Also, I think you can probably drop partial_store and check the index
directly here.

> +	else if (!ret && !is_vswap_entry(swp)) {
>  		unsigned type = swp_type(swp);
>  		pgoff_t offset = swp_offset(swp);
>  		struct zswap_entry *entry;
> @@ -1588,8 +1599,7 @@ bool zswap_store(struct folio *folio)
>  int zswap_load(struct folio *folio)
>  {
>  	swp_entry_t swp = folio->swap;
> -	pgoff_t offset = swp_offset(swp);
> -	struct xarray *tree = swap_zswap_tree(swp);
> +	struct swap_info_struct *si = __swap_entry_to_info(swp);
>  	struct zswap_entry *entry;
>  
>  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
> @@ -1599,16 +1609,25 @@ int zswap_load(struct folio *folio)
>  		return -ENOENT;
>  
>  	/*
> -	 * Large folios should not be swapped in while zswap is being used, as
> -	 * they are not properly handled. Zswap does not properly load large
> -	 * folios, and a large folio may only be partially in zswap.
> +	 * zswap_load() does not support large folios. For non-vswap
> +	 * entries this is unexpected on the swapin path: WARN and
> +	 * sigbus. For vswap entries __swap_cache_add_check() has already
> +	 * filtered out ZSWAP-backed THPs under the cluster lock, so the
> +	 * large folio here is zero- or phys-backed; return -ENOENT to
> +	 * fall through to the phys/zero IO path.

Hmm should we start simple and avoid THP swapin for vswap initially?

IIUC, it isn't really vswap specific. Even without vswap, it's possible
that an entire folio is on-disk, not in zswap, in which case THP swap
should be allowed.

I assume it's not common for zswap to be enabled and an entire THP worth
of pages are not in zswap, so maybe we can add this later?

>  	 */
> -	if (WARN_ON_ONCE(folio_test_large(folio))) {
> -		folio_unlock(folio);
> -		return -EINVAL;
> +	if (folio_test_large(folio)) {
> +		if (WARN_ON_ONCE(!swap_is_vswap(si))) {
> +			folio_unlock(folio);
> +			return -EINVAL;
> +		}
> +		return -ENOENT;
>  	}
>  
> -	entry = xa_load(tree, offset);
> +	if (swap_is_vswap(si))
> +		entry = vswap_zswap_load(swp);
> +	else
> +		entry = xa_load(swap_zswap_tree(swp), swp_offset(swp));
>  	if (!entry)
>  		return -ENOENT;
>  
> @@ -1623,16 +1642,14 @@ int zswap_load(struct folio *folio)
>  	if (entry->objcg)
>  		count_objcg_events(entry->objcg, ZSWPIN, 1);
>  
> -	/*
> -	 * We are reading into the swapcache, invalidate zswap entry.
> -	 * The swapcache is the authoritative owner of the page and
> -	 * its mappings, and the pressure that results from having two
> -	 * in-memory copies outweighs any benefits of caching the
> -	 * compression work.
> -	 */
>  	folio_mark_dirty(folio);
> -	xa_erase(tree, offset);
> -	zswap_entry_free(entry);
> +
> +	if (swap_is_vswap(si)) {
> +		folio_release_vswap_backing(folio);

Is there any advantage to calling folio_release_vswap_backing() over
zswap_entry_free()? Seems like __vswap_release_backing() ends up just
calling zswap_entry_free() -- and I don't see any vswap-specific state
being cleaned up.

I wonder if the zswap code should call zswap_entry_free() directly? Same
goes for the call in zswap_store() above.

> +	} else {
> +		xa_erase(swap_zswap_tree(swp), swp_offset(swp));
> +		zswap_entry_free(entry);
> +	}
>  
>  	folio_unlock(folio);
>  	return 0;
> -- 
> 2.53.0-Meta
>

next prev parent reply	other threads:[~2026-06-23  0:16 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-12 19:37 [RFC PATCH v2 0/7] mm, swap: Virtual Swap Space (Swap Table Edition) Nhat Pham
2026-06-12 19:37 ` [RFC PATCH v2 1/7] mm, swap: add virtual swap device infrastructure Nhat Pham
2026-06-12 19:37 ` [RFC PATCH v2 2/7] mm, swap: support zswap and zeroswap as vswap backends Nhat Pham
2026-06-23  0:15   ` Yosry Ahmed [this message]
2026-06-23  0:18   ` Yosry Ahmed
2026-06-12 19:37 ` [RFC PATCH v2 3/7] mm, swap: support physical swap as a vswap backend Nhat Pham
2026-06-23  0:23   ` Yosry Ahmed
2026-06-12 19:37 ` [RFC PATCH v2 4/7] mm, swap: only charge physical swap entries Nhat Pham
2026-06-12 19:37 ` [RFC PATCH v2 5/7] mm, swap: add debugfs counters for vswap Nhat Pham
2026-06-12 19:37 ` [RFC PATCH v2 6/7] mm, swap: defer memcg_table allocation on physical clusters Nhat Pham
2026-06-12 19:37 ` [RFC PATCH v2 7/7] mm, swap: widen swap_info_struct max/pages to unsigned long Nhat Pham
2026-06-14  8:20 ` [RFC PATCH v2 0/7] mm, swap: Virtual Swap Space (Swap Table Edition) YoungJun Park
2026-06-15  2:38   ` Nhat Pham
     [not found]     ` <CAO9r8zPj5EH8Mbpc6N+d1u2eEgoV33f+4s=v-84gaobAodPtUw@mail.gmail.com>
     [not found]       ` <ajCm44rYpLOKCQ43@yjaykim-PowerEdge-T330>
2026-06-16 12:15         ` Nhat Pham

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ajnNWRO7apBq2-kQ@google.com \
    --to=yosry@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baohua@kernel.org \
    --cc=baoquan.he@linux.dev \
    --cc=cgroups@vger.kernel.org \
    --cc=chengming.zhou@linux.dev \
    --cc=chrisl@kernel.org \
    --cc=david@kernel.org \
    --cc=gourry@gourry.net \
    --cc=hannes@cmpxchg.org \
    --cc=haowenchao22@gmail.com \
    --cc=kasong@tencent.com \
    --cc=kernel-team@meta.com \
    --cc=liam@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=qi.zheng@linux.dev \
    --cc=riel@surriel.com \
    --cc=roman.gushchin@linux.dev \
    --cc=rppt@kernel.org \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=surenb@google.com \
    --cc=vbabka@kernel.org \
    --cc=weixugc@google.com \
    --cc=youngjun.park@lge.com \
    --cc=yuanchu@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox