From: Hao Jia <jiahao.kernel@gmail.com>
To: Yosry Ahmed <yosry@kernel.org>
Cc: akpm@linux-foundation.org, tj@kernel.org, hannes@cmpxchg.org,
shakeel.butt@linux.dev, mhocko@kernel.org, mkoutny@suse.com,
nphamcs@gmail.com, chengming.zhou@linux.dev,
muchun.song@linux.dev, roman.gushchin@linux.dev,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, Hao Jia <jiahao1@lixiang.com>
Subject: Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
Date: Tue, 23 Jun 2026 21:22:58 +0800 [thread overview]
Message-ID: <d0f05c35-457a-4b2c-6faa-7a83d4bdec01@gmail.com> (raw)
In-Reply-To: <ajnB8IZrFZwbIr9P@google.com>
On 2026/6/23 07:33, Yosry Ahmed wrote:
> On Thu, Jun 18, 2026 at 12:48:53PM +0800, Hao Jia wrote:
>> From: Hao Jia <jiahao1@lixiang.com>
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 761cd699e0a3..d7d031dee4cd 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -160,6 +160,11 @@ struct zswap_pool {
>> char tfm_name[CRYPTO_MAX_ALG_NAME];
>> };
>>
>> +struct zswap_shrink_walk_arg {
>> + unsigned long bytes_written;
>> + bool encountered_page_in_swapcache;
>> +};
>> +
>> /* Global LRU lists shared by all zswap pools. */
>> static struct list_lru zswap_list_lru;
>>
>> @@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>> void *arg)
>> {
>> struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
>> - bool *encountered_page_in_swapcache = (bool *)arg;
>> + struct zswap_shrink_walk_arg *walk_arg = arg;
>> swp_entry_t swpentry;
>> + unsigned int length;
>> enum lru_status ret = LRU_REMOVED_RETRY;
>> int writeback_result;
>>
>> @@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>> * Once the lru lock is dropped, the entry might get freed. The
>> * swpentry is copied to the stack, and entry isn't deref'd again
>> * until the entry is verified to still be alive in the tree.
>> + *
>> + * entry->length is also copied while the lock is held, because
>> + * zswap_writeback_entry() frees the entry on success and we still
>> + * need its compressed size to account for writeback.
>
> Hmm that's unnecessary, just update "The swpentry is copied to the
> stack.." above to "Copy neded fields to the stack.." or something.
I'll do this, thanks.
>
>> */
>> swpentry = entry->swpentry;
>> + length = entry->length;
>>
>> /*
>> * It's safe to drop the lock here because we return either
>> @@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>> * into the warmer region. We should terminate shrinking (if we're in the dynamic
>> * shrinker context).
>> */
>> - if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
>> + if (writeback_result == -EEXIST) {
>> ret = LRU_STOP;
>> - *encountered_page_in_swapcache = true;
>> + walk_arg->encountered_page_in_swapcache = true;
>> }
>> } else {
>> zswap_written_back_pages++;
>> + walk_arg->bytes_written += length;
>> }
>>
>> return ret;
>> @@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
>> static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>> struct shrink_control *sc)
>> {
>> + struct zswap_shrink_walk_arg walk_arg = {
>> + .bytes_written = 0,
>> + .encountered_page_in_swapcache = false,
>> + };
>> unsigned long shrink_ret;
>> - bool encountered_page_in_swapcache = false;
>>
>> if (!zswap_shrinker_enabled ||
>> !mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
>> @@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>> }
>>
>> shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
>> - &encountered_page_in_swapcache);
>> + &walk_arg);
>>
>> - if (encountered_page_in_swapcache)
>> + if (walk_arg.encountered_page_in_swapcache)
>> return SHRINK_STOP;
>>
>> return shrink_ret ? shrink_ret : SHRINK_STOP;
>> @@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
>> return shrinker;
>> }
>>
>> -static int shrink_memcg(struct mem_cgroup *memcg)
>> -{
>> - int nid, shrunk = 0, scanned = 0;
>> +/*
>> + * The maximum acceptable scan cost factor for writing back
>> + * PAGE_SIZE bytes of compressed data.
>> + */
>> +#define ZSWAP_WB_SCAN_FACTOR 16UL
>> +#define NR_ZSWAP_WB_BATCH 64UL
>>
>> +/*
>> + * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
>> + * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
>> + *
>> + * Return: The number of bytes written back, or -ENOENT if @memcg has
>> + * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
>> + */
>> +static long shrink_memcg(struct mem_cgroup *memcg,
>> + unsigned long nr_to_writeback)
>
>
> Is nr_to_writeback supposed to be the number of pages we want to
> writeback (regardless of their compressed size), or the compressed bytes
> we want to writeback divided by PAGE_SIZE?
>
> The way it's being used below seems like it's the latter, but the batch
> size should be in terms of scanned pages (i.e. uncompressed pages). So
> this is confusing.
>
> The zswap_store() path expects to reclaim one uncompressed page, but
> this will reclaim PAGE_SIZE worth of compressed memory when passing 1
> IIUC (actually maybe more, see below).
>
>> +{
>> + struct zswap_shrink_walk_arg walk_arg = {
>> + .bytes_written = 0,
>> + .encountered_page_in_swapcache = false,
>> + };
>> + u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
>> + bool memcg_list_is_empty = true;
>> + int nid;
>> +
>> + /* Memcg with zswap writeback disabled are not candidates. */
>
> The comment is unnecessary here, it should be obvious.
I'll do this, thanks.
>
>> if (!mem_cgroup_zswap_writeback_enabled(memcg))
>> return -ENOENT;
>>
>> @@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>> return -ENOENT;
>>
>> for_each_node_state(nid, N_NORMAL_MEMORY) {
>> - unsigned long nr_to_walk = 1;
>> + unsigned long nr_to_scan, nr_scanned = 0;
>> + unsigned long remain;
>> + walk_arg.encountered_page_in_swapcache = false;
>> + /*
>> + * Cap by LRU length: bounds rewalks when referenced
>> + * entries keep rotating to the tail.
>> + */
>> + nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
>> + if (!nr_to_scan)
>> + continue;
>
> Hmm generally if we are running out of pages to scan then we should scan
> the rotated entries, and reclaim them on the second pass, right? So this
> should be working as intended. But I guess this doesn't work well when
> iterating multiple memcgs, as we don't want to drain referenced entries
> in one memcg before reclaiming already rotated entries on another.
>
> So I think the assumption here is that the caller will retry if needed,
> handling balancing scanning between multiple memcgs if needed. Maybe we
> should document this in the function doc above? We should explain that
> referenced entries will be rotated but not reclaimed as part of the same
> call.
>
>> + memcg_list_is_empty = false;
>> +
>> + /*
>> + * Cap by SCAN_FACTOR * remain budget: bounds scan cost
>> + * to the remaining writeback budget.
>> + */
>> + remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
>> + nr_to_scan = min(nr_to_scan,
>> + remain * ZSWAP_WB_SCAN_FACTOR);
>
> For the zswap_store() path bytes_to_writeback=PAGE_SIZE, so remain will
> initially be 1. But then we multiply by this factor and now to scan 16
> pages? Also, where did this factor and equation come from?
>
> We'll also loop over nodes, so we may end up scanning 32 or more pages
> depending on the number of nodes in the system.
>
> If this is just a heuristic, we should really just start simple and add
> heuristics later as needed. The caller should probably pass in the
> number of pages to scan (i.e. uncompressed pages), and leave it to the
> caller to decide when to retry if the actual memory savings are
> realized.
>
>>
>> - shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
>> - &shrink_memcg_cb, NULL, &nr_to_walk);
>> - scanned += 1 - nr_to_walk;
>> + while (nr_scanned < nr_to_scan) {
>> + unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
>> + nr_to_scan - nr_scanned);
>> +
>> + /*
>> + * Account for the committed budget rather than the walker's
>> + * actual delta. If the list is emptied concurrently, the
>> + * walker visits nothing and nr_scanned would never advance.
>> + */
>> + nr_scanned += nr_to_walk;
>> +
>> + list_lru_walk_one(&zswap_list_lru, nid, memcg,
>> + &shrink_memcg_cb,
>> + &walk_arg,
>> + &nr_to_walk);
>> +
>> + if (walk_arg.bytes_written >= bytes_to_writeback)
>> + return walk_arg.bytes_written;
>> +
>> + if (walk_arg.encountered_page_in_swapcache)
>> + break;
>> +
>> + cond_resched();
>> + }
>
> If the caller is expected to have a retry loop anyway, should we
> simplify this and just scan each per-node LRU once?
>
> We should also probably bail early if the number of scanned pages has
> already been reached? Currently shrink_memcg() scans one page at a time,
> so if it scans a bit more to balance between the nodes it's probably
> fine.
>
> But with batching, we could end up scanning hundres of extra pages just
> to balance between all nodes. Is node imbalance a real issue?
>
My initial thought was that if cold memory is evenly distributed across
nodes and we are doing a large writeback, it would be better to balance
the zswap entry writeback across all nodes rather than just draining
node 0 first. However, since we currently lack a proper metric to
represent hot/cold memory (such as age-based tracking), doing this
probably doesn't make much sense right now.
So, perhaps we want something like this? Please correct me if I'm wrong.
static long shrink_memcg(struct mem_cgroup *memcg,
unsigned long nr_to_scan)
{
struct zswap_shrink_walk_arg walk_arg = {
.bytes_written = 0,
.encountered_page_in_swapcache = false,
};
unsigned long nr_remaining = nr_to_scan;
bool memcg_list_is_empty = true;
int nid;
if (!mem_cgroup_zswap_writeback_enabled(memcg))
return -ENOENT;
if (memcg && !mem_cgroup_online(memcg))
return -ENOENT;
for_each_node_state(nid, N_NORMAL_MEMORY) {
unsigned long nr_to_walk;
/*
* Cap the per-node scan by the current LRU length. A referenced
* entry is only rotated to the tail (second chance) and may be
* revisited within a single walk; without this cap those rotated
* entries could drain the shared scan budget on one node.
*/
nr_to_walk = min(nr_remaining,
list_lru_count_one(&zswap_list_lru, nid, memcg));
if (!nr_to_walk)
continue;
memcg_list_is_empty = false;
nr_remaining -= nr_to_walk;
list_lru_walk_one(&zswap_list_lru, nid, memcg,
&shrink_memcg_cb, &walk_arg, &nr_to_walk);
/* Return the unused share of the budget to the pool. */
nr_remaining += nr_to_walk;
/* Bail out once the whole scan budget has been spent. */
if (!nr_remaining)
break;
cond_resched();
}
if (memcg_list_is_empty)
return -ENOENT;
return walk_arg.bytes_written;
}
Thanks,
Hao
next prev parent reply other threads:[~2026-06-23 13:23 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-18 4:48 [PATCH v4 0/5] mm/zswap: Implement per-cgroup proactive writeback Hao Jia
2026-06-18 4:48 ` [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability Hao Jia
2026-06-22 23:33 ` Yosry Ahmed
2026-06-23 13:22 ` Hao Jia [this message]
2026-06-23 18:17 ` Yosry Ahmed
2026-06-18 4:48 ` [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker() Hao Jia
2026-06-22 23:36 ` Yosry Ahmed
2026-06-18 4:48 ` [PATCH v4 3/5] mm/zswap: Implement proactive writeback Hao Jia
2026-06-22 23:40 ` Yosry Ahmed
2026-06-18 4:48 ` [PATCH v4 4/5] mm/zswap: Add per-memcg stat for " Hao Jia
2026-06-22 23:42 ` Yosry Ahmed
2026-06-18 4:48 ` [PATCH v4 5/5] selftests/cgroup: Add tests for zswap " Hao Jia
2026-06-21 4:20 ` [PATCH v4 0/5] mm/zswap: Implement per-cgroup " Muchun Song
2026-06-22 6:08 ` Hao Jia
2026-06-22 10:04 ` Youngjun Park
2026-06-22 21:29 ` Yosry Ahmed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d0f05c35-457a-4b2c-6faa-7a83d4bdec01@gmail.com \
--to=jiahao.kernel@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=chengming.zhou@linux.dev \
--cc=hannes@cmpxchg.org \
--cc=jiahao1@lixiang.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=tj@kernel.org \
--cc=yosry@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox