Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability

Linux Documentation
 help / color / mirror / Atom feed

From: Hao Jia <jiahao.kernel@gmail.com>
To: Yosry Ahmed <yosry@kernel.org>
Cc: akpm@linux-foundation.org, tj@kernel.org, hannes@cmpxchg.org,
	shakeel.butt@linux.dev, mhocko@kernel.org, mkoutny@suse.com,
	nphamcs@gmail.com, chengming.zhou@linux.dev,
	muchun.song@linux.dev, roman.gushchin@linux.dev,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, Hao Jia <jiahao1@lixiang.com>
Subject: Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
Date: Tue, 23 Jun 2026 21:22:58 +0800	[thread overview]
Message-ID: <d0f05c35-457a-4b2c-6faa-7a83d4bdec01@gmail.com> (raw)
In-Reply-To: <ajnB8IZrFZwbIr9P@google.com>



On 2026/6/23 07:33, Yosry Ahmed wrote:
> On Thu, Jun 18, 2026 at 12:48:53PM +0800, Hao Jia wrote:
>> From: Hao Jia <jiahao1@lixiang.com>
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 761cd699e0a3..d7d031dee4cd 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -160,6 +160,11 @@ struct zswap_pool {
>>   	char tfm_name[CRYPTO_MAX_ALG_NAME];
>>   };
>>   
>> +struct zswap_shrink_walk_arg {
>> +	unsigned long bytes_written;
>> +	bool encountered_page_in_swapcache;
>> +};
>> +
>>   /* Global LRU lists shared by all zswap pools. */
>>   static struct list_lru zswap_list_lru;
>>   
>> @@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>>   				       void *arg)
>>   {
>>   	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
>> -	bool *encountered_page_in_swapcache = (bool *)arg;
>> +	struct zswap_shrink_walk_arg *walk_arg = arg;
>>   	swp_entry_t swpentry;
>> +	unsigned int length;
>>   	enum lru_status ret = LRU_REMOVED_RETRY;
>>   	int writeback_result;
>>   
>> @@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>>   	 * Once the lru lock is dropped, the entry might get freed. The
>>   	 * swpentry is copied to the stack, and entry isn't deref'd again
>>   	 * until the entry is verified to still be alive in the tree.
>> +	 *
>> +	 * entry->length is also copied while the lock is held, because
>> +	 * zswap_writeback_entry() frees the entry on success and we still
>> +	 * need its compressed size to account for writeback.
> 
> Hmm that's unnecessary, just update "The swpentry is copied to the
> stack.." above to "Copy neded fields to the stack.." or something.

I'll do this, thanks.

> 
>>   	 */
>>   	swpentry = entry->swpentry;
>> +	length = entry->length;
>>   
>>   	/*
>>   	 * It's safe to drop the lock here because we return either
>> @@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>>   		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
>>   		 * shrinker context).
>>   		 */
>> -		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
>> +		if (writeback_result == -EEXIST) {
>>   			ret = LRU_STOP;
>> -			*encountered_page_in_swapcache = true;
>> +			walk_arg->encountered_page_in_swapcache = true;
>>   		}
>>   	} else {
>>   		zswap_written_back_pages++;
>> +		walk_arg->bytes_written += length;
>>   	}
>>   
>>   	return ret;
>> @@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>>   static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>>   		struct shrink_control *sc)
>>   {
>> +	struct zswap_shrink_walk_arg walk_arg = {
>> +		.bytes_written = 0,
>> +		.encountered_page_in_swapcache = false,
>> +	};
>>   	unsigned long shrink_ret;
>> -	bool encountered_page_in_swapcache = false;
>>   
>>   	if (!zswap_shrinker_enabled ||
>>   			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
>> @@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>>   	}
>>   
>>   	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
>> -		&encountered_page_in_swapcache);
>> +		&walk_arg);
>>   
>> -	if (encountered_page_in_swapcache)
>> +	if (walk_arg.encountered_page_in_swapcache)
>>   		return SHRINK_STOP;
>>   
>>   	return shrink_ret ? shrink_ret : SHRINK_STOP;
>> @@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
>>   	return shrinker;
>>   }
>>   
>> -static int shrink_memcg(struct mem_cgroup *memcg)
>> -{
>> -	int nid, shrunk = 0, scanned = 0;
>> +/*
>> + * The maximum acceptable scan cost factor for writing back
>> + * PAGE_SIZE bytes of compressed data.
>> + */
>> +#define ZSWAP_WB_SCAN_FACTOR	16UL
>> +#define NR_ZSWAP_WB_BATCH	64UL
>>   
>> +/*
>> + * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
>> + * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
>> + *
>> + * Return: The number of bytes written back, or -ENOENT if @memcg has
>> + * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
>> + */
>> +static long shrink_memcg(struct mem_cgroup *memcg,
>> +			 unsigned long nr_to_writeback)
> 
> 
> Is nr_to_writeback supposed to be the number of pages we want to
> writeback (regardless of their compressed size), or the compressed bytes
> we want to writeback divided by PAGE_SIZE?
> 
> The way it's being used below seems like it's the latter, but the batch
> size should be in terms of scanned pages (i.e. uncompressed pages). So
> this is confusing.
> 
> The zswap_store() path expects to reclaim one uncompressed page, but
> this will reclaim PAGE_SIZE worth of compressed memory when passing 1
> IIUC (actually maybe more, see below).
> 
>> +{
>> +	struct zswap_shrink_walk_arg walk_arg = {
>> +		.bytes_written = 0,
>> +		.encountered_page_in_swapcache = false,
>> +	};
>> +	u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
>> +	bool memcg_list_is_empty = true;
>> +	int nid;
>> +
>> +	/* Memcg with zswap writeback disabled are not candidates. */
> 
> The comment is unnecessary here, it should be obvious.

I'll do this, thanks.
> 
>>   	if (!mem_cgroup_zswap_writeback_enabled(memcg))
>>   		return -ENOENT;
>>   
>> @@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>>   		return -ENOENT;
>>   
>>   	for_each_node_state(nid, N_NORMAL_MEMORY) {
>> -		unsigned long nr_to_walk = 1;
>> +		unsigned long nr_to_scan, nr_scanned = 0;
>> +		unsigned long remain;
>> +		walk_arg.encountered_page_in_swapcache = false;
>> +		/*
>> +		 * Cap by LRU length: bounds rewalks when referenced
>> +		 * entries keep rotating to the tail.
>> +		 */
>> +		nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
>> +		if (!nr_to_scan)
>> +			continue;
> 
> Hmm generally if we are running out of pages to scan then we should scan
> the rotated entries, and reclaim them on the second pass, right? So this
> should be working as intended. But I guess this doesn't work well when
> iterating multiple memcgs, as we don't want to drain referenced entries
> in one memcg before reclaiming already rotated entries on another.
> 
> So I think the assumption here is that the caller will retry if needed,
> handling balancing scanning between multiple memcgs if needed. Maybe we
> should document this in the function doc above? We should explain that
> referenced entries will be rotated but not reclaimed as part of the same
> call.
> 
>> +		memcg_list_is_empty = false;
>> +
>> +		/*
>> +		 * Cap by SCAN_FACTOR * remain budget: bounds scan cost
>> +		 * to the remaining writeback budget.
>> +		 */
>> +		remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
>> +		nr_to_scan = min(nr_to_scan,
>> +				 remain * ZSWAP_WB_SCAN_FACTOR);
> 
> For the zswap_store() path bytes_to_writeback=PAGE_SIZE, so remain will
> initially be 1. But then we multiply by this factor and now to scan 16
> pages? Also, where did this factor and equation come from?
> 
> We'll also loop over nodes, so we may end up scanning 32 or more pages
> depending on the number of nodes in the system.
> 
> If this is just a heuristic, we should really just start simple and add
> heuristics later as needed. The caller should probably pass in the
> number of pages to scan (i.e. uncompressed pages), and leave it to the
> caller to decide when to retry if the actual memory savings are
> realized.
> 
>>   
>> -		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>> -					    &shrink_memcg_cb, NULL, &nr_to_walk);
>> -		scanned += 1 - nr_to_walk;
>> +		while (nr_scanned < nr_to_scan) {
>> +			unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
>> +						       nr_to_scan - nr_scanned);
>> +
>> +			/*
>> +			 * Account for the committed budget rather than the walker's
>> +			 * actual delta. If the list is emptied concurrently, the
>> +			 * walker visits nothing and nr_scanned would never advance.
>> +			 */
>> +			nr_scanned += nr_to_walk;
>> +
>> +			list_lru_walk_one(&zswap_list_lru, nid, memcg,
>> +					  &shrink_memcg_cb,
>> +					  &walk_arg,
>> +					  &nr_to_walk);
>> +
>> +			if (walk_arg.bytes_written >= bytes_to_writeback)
>> +				return walk_arg.bytes_written;
>> +
>> +			if (walk_arg.encountered_page_in_swapcache)
>> +				break;
>> +
>> +			cond_resched();
>> +		}
> 
> If the caller is expected to have a retry loop anyway, should we
> simplify this and just scan each per-node LRU once?
> 
> We should also probably bail early if the number of scanned pages has
> already been reached? Currently shrink_memcg() scans one page at a time,
> so if it scans a bit more to balance between the nodes it's probably
> fine.
> 
> But with batching, we could end up scanning hundres of extra pages just
> to balance between all nodes. Is node imbalance a real issue?
> 

My initial thought was that if cold memory is evenly distributed across 
nodes and we are doing a large writeback, it would be better to balance 
the zswap entry writeback across all nodes rather than just draining 
node 0 first. However, since we currently lack a proper metric to 
represent hot/cold memory (such as age-based tracking), doing this 
probably doesn't make much sense right now.

So, perhaps we want something like this? Please correct me if I'm wrong.

static long shrink_memcg(struct mem_cgroup *memcg,
        unsigned long nr_to_scan)
{
   struct zswap_shrink_walk_arg walk_arg = {
     .bytes_written = 0,
     .encountered_page_in_swapcache = false,
   };
   unsigned long nr_remaining = nr_to_scan;
   bool memcg_list_is_empty = true;
   int nid;

   if (!mem_cgroup_zswap_writeback_enabled(memcg))
     return -ENOENT;

   if (memcg && !mem_cgroup_online(memcg))
     return -ENOENT;

   for_each_node_state(nid, N_NORMAL_MEMORY) {
     unsigned long nr_to_walk;

     /*
      * Cap the per-node scan by the current LRU length. A referenced
      * entry is only rotated to the tail (second chance) and may be
      * revisited within a single walk; without this cap those rotated
      * entries could drain the shared scan budget on one node.
      */
     nr_to_walk = min(nr_remaining,
          list_lru_count_one(&zswap_list_lru, nid, memcg));
     if (!nr_to_walk)
       continue;
     memcg_list_is_empty = false;

     nr_remaining -= nr_to_walk;
     list_lru_walk_one(&zswap_list_lru, nid, memcg,
           &shrink_memcg_cb, &walk_arg, &nr_to_walk);
     /* Return the unused share of the budget to the pool. */
     nr_remaining += nr_to_walk;

     /* Bail out once the whole scan budget has been spent. */
     if (!nr_remaining)
       break;

     cond_resched();
   }

   if (memcg_list_is_empty)
     return -ENOENT;

   return walk_arg.bytes_written;
}

Thanks,
Hao

next prev parent reply	other threads:[~2026-06-23 13:23 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-18  4:48 [PATCH v4 0/5] mm/zswap: Implement per-cgroup proactive writeback Hao Jia
2026-06-18  4:48 ` [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability Hao Jia
2026-06-22 23:33   ` Yosry Ahmed
2026-06-23 13:22     ` Hao Jia [this message]
2026-06-23 18:17       ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker() Hao Jia
2026-06-22 23:36   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 3/5] mm/zswap: Implement proactive writeback Hao Jia
2026-06-22 23:40   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 4/5] mm/zswap: Add per-memcg stat for " Hao Jia
2026-06-22 23:42   ` Yosry Ahmed
2026-06-18  4:48 ` [PATCH v4 5/5] selftests/cgroup: Add tests for zswap " Hao Jia
2026-06-21  4:20 ` [PATCH v4 0/5] mm/zswap: Implement per-cgroup " Muchun Song
2026-06-22  6:08   ` Hao Jia
2026-06-22 10:04     ` Youngjun Park
2026-06-22 21:29       ` Yosry Ahmed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=d0f05c35-457a-4b2c-6faa-7a83d4bdec01@gmail.com \
    --to=jiahao.kernel@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=chengming.zhou@linux.dev \
    --cc=hannes@cmpxchg.org \
    --cc=jiahao1@lixiang.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=tj@kernel.org \
    --cc=yosry@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox