Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Yosry Ahmed <yosry@kernel.org>
To: Hao Jia <jiahao.kernel@gmail.com>
Cc: akpm@linux-foundation.org, tj@kernel.org, hannes@cmpxchg.org,
	 shakeel.butt@linux.dev, mhocko@kernel.org, mkoutny@suse.com,
	nphamcs@gmail.com,  chengming.zhou@linux.dev,
	muchun.song@linux.dev, roman.gushchin@linux.dev,
	 linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org,  Hao Jia <jiahao1@lixiang.com>
Subject: Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
Date: Mon, 22 Jun 2026 23:33:10 +0000	[thread overview]
Message-ID: <ajnB8IZrFZwbIr9P@google.com> (raw)
In-Reply-To: <20260618044857.69439-2-jiahao.kernel@gmail.com>

On Thu, Jun 18, 2026 at 12:48:53PM +0800, Hao Jia wrote:
> From: Hao Jia <jiahao1@lixiang.com>
> 
> Currently, shrink_memcg() writes back at most one entry per-node
> during its traversal. This makes shrink_worker() inefficient, as
> it must repeatedly re-enter shrink_memcg() to make any substantial
> progress.
> 
> To address this, extend shrink_memcg() and rewrite its LRU iteration
> logic to support batch writeback. Introduce the nr_to_writeback
> parameter to support a writeback budget based on compressed size.
> This enables batch writeback in the shrink_worker() path, while
> maintaining a low writeback budget in the zswap_store() path.
> 
> Additionally, to prepare for future proactive writeback, update
> the return value semantics of shrink_memcg(): a positive value now
> represents the actual number of compressed bytes written back, 0
> indicates that candidates existed but no writeback succeeded, and
> a negative value represents an error code.
> 
> Suggested-by: Yosry Ahmed <yosry@kernel.org>
> Signed-off-by: Hao Jia <jiahao1@lixiang.com>
> ---
>  mm/zswap.c | 116 ++++++++++++++++++++++++++++++++++++++++++++---------
>  1 file changed, 97 insertions(+), 19 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 761cd699e0a3..d7d031dee4cd 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -160,6 +160,11 @@ struct zswap_pool {
>  	char tfm_name[CRYPTO_MAX_ALG_NAME];
>  };
>  
> +struct zswap_shrink_walk_arg {
> +	unsigned long bytes_written;
> +	bool encountered_page_in_swapcache;
> +};
> +
>  /* Global LRU lists shared by all zswap pools. */
>  static struct list_lru zswap_list_lru;
>  
> @@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  				       void *arg)
>  {
>  	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> -	bool *encountered_page_in_swapcache = (bool *)arg;
> +	struct zswap_shrink_walk_arg *walk_arg = arg;
>  	swp_entry_t swpentry;
> +	unsigned int length;
>  	enum lru_status ret = LRU_REMOVED_RETRY;
>  	int writeback_result;
>  
> @@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  	 * Once the lru lock is dropped, the entry might get freed. The
>  	 * swpentry is copied to the stack, and entry isn't deref'd again
>  	 * until the entry is verified to still be alive in the tree.
> +	 *
> +	 * entry->length is also copied while the lock is held, because
> +	 * zswap_writeback_entry() frees the entry on success and we still
> +	 * need its compressed size to account for writeback.

Hmm that's unnecessary, just update "The swpentry is copied to the
stack.." above to "Copy neded fields to the stack.." or something.

>  	 */
>  	swpentry = entry->swpentry;
> +	length = entry->length;
>  
>  	/*
>  	 * It's safe to drop the lock here because we return either
> @@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
>  		 * shrinker context).
>  		 */
> -		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
> +		if (writeback_result == -EEXIST) {
>  			ret = LRU_STOP;
> -			*encountered_page_in_swapcache = true;
> +			walk_arg->encountered_page_in_swapcache = true;
>  		}
>  	} else {
>  		zswap_written_back_pages++;
> +		walk_arg->bytes_written += length;
>  	}
>  
>  	return ret;
> @@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  		struct shrink_control *sc)
>  {
> +	struct zswap_shrink_walk_arg walk_arg = {
> +		.bytes_written = 0,
> +		.encountered_page_in_swapcache = false,
> +	};
>  	unsigned long shrink_ret;
> -	bool encountered_page_in_swapcache = false;
>  
>  	if (!zswap_shrinker_enabled ||
>  			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
> @@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  	}
>  
>  	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
> -		&encountered_page_in_swapcache);
> +		&walk_arg);
>  
> -	if (encountered_page_in_swapcache)
> +	if (walk_arg.encountered_page_in_swapcache)
>  		return SHRINK_STOP;
>  
>  	return shrink_ret ? shrink_ret : SHRINK_STOP;
> @@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
>  	return shrinker;
>  }
>  
> -static int shrink_memcg(struct mem_cgroup *memcg)
> -{
> -	int nid, shrunk = 0, scanned = 0;
> +/*
> + * The maximum acceptable scan cost factor for writing back
> + * PAGE_SIZE bytes of compressed data.
> + */
> +#define ZSWAP_WB_SCAN_FACTOR	16UL
> +#define NR_ZSWAP_WB_BATCH	64UL
>  
> +/*
> + * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
> + * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
> + *
> + * Return: The number of bytes written back, or -ENOENT if @memcg has
> + * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
> + */
> +static long shrink_memcg(struct mem_cgroup *memcg,
> +			 unsigned long nr_to_writeback)


Is nr_to_writeback supposed to be the number of pages we want to
writeback (regardless of their compressed size), or the compressed bytes
we want to writeback divided by PAGE_SIZE?

The way it's being used below seems like it's the latter, but the batch
size should be in terms of scanned pages (i.e. uncompressed pages). So
this is confusing.

The zswap_store() path expects to reclaim one uncompressed page, but
this will reclaim PAGE_SIZE worth of compressed memory when passing 1
IIUC (actually maybe more, see below).

> +{
> +	struct zswap_shrink_walk_arg walk_arg = {
> +		.bytes_written = 0,
> +		.encountered_page_in_swapcache = false,
> +	};
> +	u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
> +	bool memcg_list_is_empty = true;
> +	int nid;
> +
> +	/* Memcg with zswap writeback disabled are not candidates. */

The comment is unnecessary here, it should be obvious.

>  	if (!mem_cgroup_zswap_writeback_enabled(memcg))
>  		return -ENOENT;
>  
> @@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>  		return -ENOENT;
>  
>  	for_each_node_state(nid, N_NORMAL_MEMORY) {
> -		unsigned long nr_to_walk = 1;
> +		unsigned long nr_to_scan, nr_scanned = 0;
> +		unsigned long remain;
> +		walk_arg.encountered_page_in_swapcache = false;
> +		/*
> +		 * Cap by LRU length: bounds rewalks when referenced
> +		 * entries keep rotating to the tail.
> +		 */
> +		nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
> +		if (!nr_to_scan)
> +			continue;

Hmm generally if we are running out of pages to scan then we should scan
the rotated entries, and reclaim them on the second pass, right? So this
should be working as intended. But I guess this doesn't work well when
iterating multiple memcgs, as we don't want to drain referenced entries
in one memcg before reclaiming already rotated entries on another.

So I think the assumption here is that the caller will retry if needed,
handling balancing scanning between multiple memcgs if needed. Maybe we
should document this in the function doc above? We should explain that
referenced entries will be rotated but not reclaimed as part of the same
call.

> +		memcg_list_is_empty = false;
> +
> +		/*
> +		 * Cap by SCAN_FACTOR * remain budget: bounds scan cost
> +		 * to the remaining writeback budget.
> +		 */
> +		remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
> +		nr_to_scan = min(nr_to_scan,
> +				 remain * ZSWAP_WB_SCAN_FACTOR);

For the zswap_store() path bytes_to_writeback=PAGE_SIZE, so remain will
initially be 1. But then we multiply by this factor and now to scan 16
pages? Also, where did this factor and equation come from?

We'll also loop over nodes, so we may end up scanning 32 or more pages
depending on the number of nodes in the system.

If this is just a heuristic, we should really just start simple and add
heuristics later as needed. The caller should probably pass in the
number of pages to scan (i.e. uncompressed pages), and leave it to the
caller to decide when to retry if the actual memory savings are
realized.

>  
> -		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
> -					    &shrink_memcg_cb, NULL, &nr_to_walk);
> -		scanned += 1 - nr_to_walk;
> +		while (nr_scanned < nr_to_scan) {
> +			unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
> +						       nr_to_scan - nr_scanned);
> +
> +			/*
> +			 * Account for the committed budget rather than the walker's
> +			 * actual delta. If the list is emptied concurrently, the
> +			 * walker visits nothing and nr_scanned would never advance.
> +			 */
> +			nr_scanned += nr_to_walk;
> +
> +			list_lru_walk_one(&zswap_list_lru, nid, memcg,
> +					  &shrink_memcg_cb,
> +					  &walk_arg,
> +					  &nr_to_walk);
> +
> +			if (walk_arg.bytes_written >= bytes_to_writeback)
> +				return walk_arg.bytes_written;
> +
> +			if (walk_arg.encountered_page_in_swapcache)
> +				break;
> +
> +			cond_resched();
> +		}

If the caller is expected to have a retry loop anyway, should we
simplify this and just scan each per-node LRU once?

We should also probably bail early if the number of scanned pages has
already been reached? Currently shrink_memcg() scans one page at a time,
so if it scans a bit more to balance between the nodes it's probably
fine.

But with batching, we could end up scanning hundres of extra pages just
to balance between all nodes. Is node imbalance a real issue?

>  	}
>  
> -	if (!scanned)
> +	/* Return -ENOENT if all zswap LRU lists are empty. */
> +	if (memcg_list_is_empty)
>  		return -ENOENT;
>  
> -	return shrunk ? 0 : -EAGAIN;
> +	return walk_arg.bytes_written;
>  }
>  
>  static void shrink_worker(struct work_struct *w)
>  {
>  	struct mem_cgroup *memcg;
> -	int ret, failures = 0, attempts = 0;
> +	int failures = 0, attempts = 0;
>  	unsigned long thr;
> +	long ret;
>  
>  	/* Reclaim down to the accept threshold */
>  	thr = zswap_accept_thr_pages();
> @@ -1368,7 +1446,7 @@ static void shrink_worker(struct work_struct *w)
>  			goto resched;
>  		}
>  
> -		ret = shrink_memcg(memcg);
> +		ret = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
>  		/* drop the extra reference */
>  		mem_cgroup_put(memcg);
>  
> @@ -1382,7 +1460,7 @@ static void shrink_worker(struct work_struct *w)
>  			continue;
>  		++attempts;
>  
> -		if (ret && ++failures == MAX_RECLAIM_RETRIES)
> +		if (ret <= 0 && ++failures == MAX_RECLAIM_RETRIES)
>  			break;
>  resched:
>  		cond_resched();
> @@ -1492,7 +1570,7 @@ bool zswap_store(struct folio *folio)
>  	objcg = get_obj_cgroup_from_folio(folio);
>  	if (objcg && !obj_cgroup_may_zswap(objcg)) {
>  		memcg = get_mem_cgroup_from_objcg(objcg);
> -		if (shrink_memcg(memcg)) {
> +		if (shrink_memcg(memcg, 1) <= 0) {
>  			mem_cgroup_put(memcg);
>  			goto put_objcg;
>  		}
> -- 
> 2.34.1
>

next prev parent reply	other threads:[~2026-06-22 23:33 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-18  4:48 [PATCH v4 0/5] mm/zswap: Implement per-cgroup proactive writeback Hao Jia
2026-06-18  4:48 ` [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability Hao Jia
2026-06-22 23:33   ` Yosry Ahmed [this message]
2026-06-18  4:48 ` [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker() Hao Jia
2026-06-22 23:36   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 3/5] mm/zswap: Implement proactive writeback Hao Jia
2026-06-22 23:40   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 4/5] mm/zswap: Add per-memcg stat for " Hao Jia
2026-06-22 23:42   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 5/5] selftests/cgroup: Add tests for zswap " Hao Jia
2026-06-21  4:20 ` [PATCH v4 0/5] mm/zswap: Implement per-cgroup " Muchun Song
2026-06-22  6:08   ` Hao Jia
2026-06-22 10:04     ` Youngjun Park
2026-06-22 21:29       ` Yosry Ahmed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ajnB8IZrFZwbIr9P@google.com \
    --to=yosry@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=chengming.zhou@linux.dev \
    --cc=hannes@cmpxchg.org \
    --cc=jiahao.kernel@gmail.com \
    --cc=jiahao1@lixiang.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.