From: Hao Jia <jiahao.kernel@gmail.com>
To: akpm@linux-foundation.org, tj@kernel.org, hannes@cmpxchg.org,
shakeel.butt@linux.dev, mhocko@kernel.org, yosry@kernel.org,
mkoutny@suse.com, nphamcs@gmail.com, chengming.zhou@linux.dev,
muchun.song@linux.dev, roman.gushchin@linux.dev
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
linux-doc@vger.kernel.org, Hao Jia <jiahao1@lixiang.com>
Subject: [PATCH v5 2/6] mm/zswap: Support batch writeback in shrink_memcg()
Date: Mon, 29 Jun 2026 19:20:28 +0800 [thread overview]
Message-ID: <20260629112032.20423-3-jiahao.kernel@gmail.com> (raw)
In-Reply-To: <20260629112032.20423-1-jiahao.kernel@gmail.com>
From: Hao Jia <jiahao1@lixiang.com>
Currently, shrink_memcg() writes back at most one entry per node during
its traversal. This makes shrink_worker() inefficient, as it must
repeatedly re-enter shrink_memcg() to make any substantial progress.
To address this, extend shrink_memcg() and rewrite its LRU iteration logic
to support batch writeback. Introduce the nr_to_scan parameter to bound how
many pages are scanned per call. This enables batch writeback in the
shrink_worker() path (NR_ZSWAP_WB_BATCH, i.e. 64, per call), while
maintaining a low scan budget in the zswap_store() path (the number of
N_NORMAL_MEMORY nodes).
Additionally, to prepare for future proactive writeback, update the return
value semantics of shrink_memcg(): a positive value now represents the
actual number of compressed bytes written back, 0 indicates that candidates
existed but no writeback succeeded (previously reported as -EAGAIN), and a
negative value represents an error code (currently only -ENOENT, returned
when the memcg has writeback disabled, is a zombie cgroup, or has empty
zswap LRUs).
Suggested-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Hao Jia <jiahao1@lixiang.com>
---
mm/zswap.c | 89 ++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 69 insertions(+), 20 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 0f8f04f22888..e2c2a3f1e061 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -160,6 +160,11 @@ struct zswap_pool {
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
+struct zswap_shrink_walk_arg {
+ unsigned long bytes_written;
+ bool encountered_page_in_swapcache;
+};
+
/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
@@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
void *arg)
{
struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
- bool *encountered_page_in_swapcache = (bool *)arg;
+ struct zswap_shrink_walk_arg *walk_arg = arg;
swp_entry_t swpentry;
+ unsigned int length;
enum lru_status ret = LRU_REMOVED_RETRY;
int writeback_result;
@@ -1133,10 +1139,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
/*
* Once the lru lock is dropped, the entry might get freed. The
- * swpentry is copied to the stack, and entry isn't deref'd again
- * until the entry is verified to still be alive in the tree.
+ * needed fields are copied to the stack, and entry isn't deref'd
+ * again until it is verified to still be alive in the tree.
*/
swpentry = entry->swpentry;
+ length = entry->length;
/*
* It's safe to drop the lock here because we return either
@@ -1155,12 +1162,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* into the warmer region. We should terminate shrinking (if we're in the dynamic
* shrinker context).
*/
- if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ if (writeback_result == -EEXIST) {
ret = LRU_STOP;
- *encountered_page_in_swapcache = true;
+ walk_arg->encountered_page_in_swapcache = true;
}
} else {
zswap_written_back_pages++;
+ walk_arg->bytes_written += length;
}
return ret;
@@ -1169,8 +1177,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
+ struct zswap_shrink_walk_arg walk_arg = {
+ .bytes_written = 0,
+ .encountered_page_in_swapcache = false,
+ };
unsigned long shrink_ret;
- bool encountered_page_in_swapcache = false;
if (!zswap_shrinker_enabled ||
!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
@@ -1179,9 +1190,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
}
shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
- &encountered_page_in_swapcache);
+ &walk_arg);
- if (encountered_page_in_swapcache)
+ if (walk_arg.encountered_page_in_swapcache)
return SHRINK_STOP;
return shrink_ret ? shrink_ret : SHRINK_STOP;
@@ -1275,9 +1286,31 @@ static struct shrinker *zswap_alloc_shrinker(void)
return shrinker;
}
-static int shrink_memcg(struct mem_cgroup *memcg)
+#define NR_ZSWAP_WB_BATCH 64UL
+
+/*
+ * Scan up to @nr_to_scan pages across the per-node zswap LRUs of @memcg
+ * and write back the reclaimable ones.
+ *
+ * Since the second-chance algorithm rotates referenced entries to the
+ * LRU tail, the per-node scan is capped at the current LRU length so
+ * each entry is scanned at most once per call. It is up to the caller
+ * to handle retries, deciding whether to scan another memcg to complete
+ * the full iteration, or to rescan the current memcg to drain its zswap
+ * entries.
+ *
+ * Return: The number of compressed bytes written back (>= 0), or -ENOENT
+ * if @memcg has writeback disabled, is a zombie cgroup, or has empty
+ * zswap LRUs.
+ */
+static long shrink_memcg(struct mem_cgroup *memcg, unsigned long nr_to_scan)
{
- int nid, shrunk = 0, scanned = 0;
+ struct zswap_shrink_walk_arg walk_arg = {
+ .bytes_written = 0,
+ .encountered_page_in_swapcache = false,
+ };
+ unsigned long nr_remaining = nr_to_scan;
+ int nid;
if (!mem_cgroup_zswap_writeback_enabled(memcg))
return -ENOENT;
@@ -1290,24 +1323,40 @@ static int shrink_memcg(struct mem_cgroup *memcg)
return -ENOENT;
for_each_node_state(nid, N_NORMAL_MEMORY) {
- unsigned long nr_to_walk = 1;
+ unsigned long nr_to_walk;
- shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
- &shrink_memcg_cb, NULL, &nr_to_walk);
- scanned += 1 - nr_to_walk;
+ /*
+ * Cap the scan at per-node LRU length so each entry is scanned
+ * at most once per call.
+ */
+ nr_to_walk = min(nr_remaining,
+ list_lru_count_one(&zswap_list_lru, nid, memcg));
+ if (!nr_to_walk)
+ continue;
+
+ nr_remaining -= nr_to_walk;
+ list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb,
+ &walk_arg, &nr_to_walk);
+ /* Return the unused share of the budget to the pool. */
+ nr_remaining += nr_to_walk;
+
+ if (!nr_remaining)
+ break;
}
- if (!scanned)
+ /* Nothing was scanned: every LRU under @memcg was empty. */
+ if (nr_remaining == nr_to_scan)
return -ENOENT;
- return shrunk ? 0 : -EAGAIN;
+ return walk_arg.bytes_written;
}
static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
- int ret, failures = 0, attempts = 0;
+ int failures = 0, attempts = 0;
unsigned long thr;
+ long ret;
/* Reclaim down to the accept threshold */
thr = zswap_accept_thr_pages();
@@ -1373,7 +1422,7 @@ static void shrink_worker(struct work_struct *w)
goto resched;
}
- ret = shrink_memcg(memcg);
+ ret = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
/* drop the extra reference */
mem_cgroup_put(memcg);
@@ -1394,7 +1443,7 @@ static void shrink_worker(struct work_struct *w)
}
++attempts;
- if (ret && ++failures == MAX_RECLAIM_RETRIES)
+ if (ret <= 0 && ++failures == MAX_RECLAIM_RETRIES)
break;
resched:
cond_resched();
@@ -1504,7 +1553,7 @@ bool zswap_store(struct folio *folio)
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
- if (shrink_memcg(memcg)) {
+ if (shrink_memcg(memcg, num_node_state(N_NORMAL_MEMORY)) <= 0) {
mem_cgroup_put(memcg);
goto put_objcg;
}
--
2.34.1
next prev parent reply other threads:[~2026-06-29 11:21 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-29 11:20 [PATCH v5 0/6] mm/zswap: Implement per-cgroup proactive writeback Hao Jia
2026-06-29 11:20 ` [PATCH v5 1/6] mm/zswap: Fix global shrinker when memory cgroup is disabled Hao Jia
2026-06-29 18:37 ` Nhat Pham
2026-06-30 10:51 ` Hao Jia
2026-06-30 16:02 ` Yosry Ahmed
2026-07-01 9:39 ` Hao Jia
2026-07-01 17:33 ` Nhat Pham
2026-06-29 11:20 ` Hao Jia [this message]
2026-06-30 0:21 ` [PATCH v5 2/6] mm/zswap: Support batch writeback in shrink_memcg() Yosry Ahmed
2026-06-30 1:18 ` Hao Jia
2026-06-29 11:20 ` [PATCH v5 3/6] mm/zswap: Extract a reusable writeback helper from shrink_worker() Hao Jia
2026-06-29 11:20 ` [PATCH v5 4/6] mm/zswap: Implement proactive writeback Hao Jia
2026-06-30 0:15 ` Yosry Ahmed
2026-06-30 1:49 ` Hao Jia
2026-06-30 16:10 ` Yosry Ahmed
2026-07-01 9:35 ` Hao Jia
2026-07-01 11:45 ` Hao Jia
2026-07-02 12:32 ` Hao Jia
2026-06-29 11:20 ` [PATCH v5 5/6] mm/zswap: Add per-memcg stat for " Hao Jia
2026-06-29 11:20 ` [PATCH v5 6/6] selftests/cgroup: Add tests for zswap " Hao Jia
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260629112032.20423-3-jiahao.kernel@gmail.com \
--to=jiahao.kernel@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=chengming.zhou@linux.dev \
--cc=hannes@cmpxchg.org \
--cc=jiahao1@lixiang.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mhocko@kernel.org \
--cc=mkoutny@suse.com \
--cc=muchun.song@linux.dev \
--cc=nphamcs@gmail.com \
--cc=roman.gushchin@linux.dev \
--cc=shakeel.butt@linux.dev \
--cc=tj@kernel.org \
--cc=yosry@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.