All of lore.kernel.org
 help / color / mirror / Atom feed
From: Hao Jia <jiahao.kernel@gmail.com>
To: akpm@linux-foundation.org, tj@kernel.org, hannes@cmpxchg.org,
	shakeel.butt@linux.dev, mhocko@kernel.org, yosry@kernel.org,
	mkoutny@suse.com, nphamcs@gmail.com, chengming.zhou@linux.dev,
	muchun.song@linux.dev, roman.gushchin@linux.dev
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-doc@vger.kernel.org, Hao Jia <jiahao1@lixiang.com>
Subject: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
Date: Thu, 18 Jun 2026 12:48:53 +0800	[thread overview]
Message-ID: <20260618044857.69439-2-jiahao.kernel@gmail.com> (raw)
In-Reply-To: <20260618044857.69439-1-jiahao.kernel@gmail.com>

From: Hao Jia <jiahao1@lixiang.com>

Currently, shrink_memcg() writes back at most one entry per node
during its traversal. This makes shrink_worker() inefficient, as
it must repeatedly re-enter shrink_memcg() to make any substantial
progress.

To address this, extend shrink_memcg() and rewrite its LRU iteration
logic to support batch writeback. Introduce the nr_to_writeback
parameter, a writeback budget expressed in units of PAGE_SIZE bytes
of compressed data. This enables batch writeback in the
shrink_worker() path (a budget of NR_ZSWAP_WB_BATCH per call), while
maintaining a low writeback budget (1) in the zswap_store() path.

Additionally, to prepare for future proactive writeback, update
the return value semantics of shrink_memcg(): a positive value now
represents the actual number of compressed bytes written back, 0
indicates that candidates existed but no writeback succeeded
(previously reported as -EAGAIN, with 0 meaning success), and a
negative value represents an error code.

Suggested-by: Yosry Ahmed <yosry@kernel.org>
Signed-off-by: Hao Jia <jiahao1@lixiang.com>
---
 mm/zswap.c | 116 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 19 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 761cd699e0a3..d7d031dee4cd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -160,6 +160,11 @@ struct zswap_pool {
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
 
+struct zswap_shrink_walk_arg {
+	unsigned long bytes_written;
+	bool encountered_page_in_swapcache;
+};
+
 /* Global LRU lists shared by all zswap pools. */
 static struct list_lru zswap_list_lru;
 
@@ -1089,8 +1094,9 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 				       void *arg)
 {
 	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
-	bool *encountered_page_in_swapcache = (bool *)arg;
+	struct zswap_shrink_walk_arg *walk_arg = arg;
 	swp_entry_t swpentry;
+	unsigned int length;
 	enum lru_status ret = LRU_REMOVED_RETRY;
 	int writeback_result;
 
@@ -1135,8 +1141,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 	 * Once the lru lock is dropped, the entry might get freed. The
 	 * swpentry is copied to the stack, and entry isn't deref'd again
 	 * until the entry is verified to still be alive in the tree.
+	 *
+	 * entry->length is also copied while the lock is held, because
+	 * zswap_writeback_entry() frees the entry on success and we still
+	 * need its compressed size to account for writeback.
 	 */
 	swpentry = entry->swpentry;
+	length = entry->length;
 
 	/*
 	 * It's safe to drop the lock here because we return either
@@ -1155,12 +1166,13 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 		 * into the warmer region. We should terminate shrinking (if we're in the dynamic
 		 * shrinker context).
 		 */
-		if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+		if (writeback_result == -EEXIST) {
 			ret = LRU_STOP;
-			*encountered_page_in_swapcache = true;
+			walk_arg->encountered_page_in_swapcache = true;
 		}
 	} else {
 		zswap_written_back_pages++;
+		walk_arg->bytes_written += length;
 	}
 
 	return ret;
@@ -1169,8 +1181,11 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
 static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 		struct shrink_control *sc)
 {
+	struct zswap_shrink_walk_arg walk_arg = {
+		.bytes_written = 0,
+		.encountered_page_in_swapcache = false,
+	};
 	unsigned long shrink_ret;
-	bool encountered_page_in_swapcache = false;
 
 	if (!zswap_shrinker_enabled ||
 			!mem_cgroup_zswap_writeback_enabled(sc->memcg)) {
@@ -1179,9 +1194,9 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 	}
 
 	shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
-		&encountered_page_in_swapcache);
+		&walk_arg);
 
-	if (encountered_page_in_swapcache)
+	if (walk_arg.encountered_page_in_swapcache)
 		return SHRINK_STOP;
 
 	return shrink_ret ? shrink_ret : SHRINK_STOP;
@@ -1275,10 +1290,32 @@ static struct shrinker *zswap_alloc_shrinker(void)
 	return shrinker;
 }
 
-static int shrink_memcg(struct mem_cgroup *memcg)
-{
-	int nid, shrunk = 0, scanned = 0;
+/*
+ * The maximum acceptable scan cost factor for writing back
+ * PAGE_SIZE bytes of compressed data.
+ */
+#define ZSWAP_WB_SCAN_FACTOR	16UL
+#define NR_ZSWAP_WB_BATCH	64UL
 
+/*
+ * Iterate over the per-node zswap LRUs of @memcg in batches, writing back
+ * up to @nr_to_writeback * PAGE_SIZE bytes of compressed data.
+ *
+ * Return: The number of bytes written back, or -ENOENT if @memcg has
+ * writeback disabled, is a zombie cgroup, or has empty zswap LRUs.
+ */
+static long shrink_memcg(struct mem_cgroup *memcg,
+			 unsigned long nr_to_writeback)
+{
+	struct zswap_shrink_walk_arg walk_arg = {
+		.bytes_written = 0,
+		.encountered_page_in_swapcache = false,
+	};
+	u64 bytes_to_writeback = nr_to_writeback << PAGE_SHIFT;
+	bool memcg_list_is_empty = true;
+	int nid;
+
+	/* Memcg with zswap writeback disabled are not candidates. */
 	if (!mem_cgroup_zswap_writeback_enabled(memcg))
 		return -ENOENT;
 
@@ -1290,24 +1327,65 @@ static int shrink_memcg(struct mem_cgroup *memcg)
 		return -ENOENT;
 
 	for_each_node_state(nid, N_NORMAL_MEMORY) {
-		unsigned long nr_to_walk = 1;
+		unsigned long nr_to_scan, nr_scanned = 0;
+		unsigned long remain;
+		walk_arg.encountered_page_in_swapcache = false;
+		/*
+		 * Cap by LRU length: bounds rewalks when referenced
+		 * entries keep rotating to the tail.
+		 */
+		nr_to_scan = list_lru_count_one(&zswap_list_lru, nid, memcg);
+		if (!nr_to_scan)
+			continue;
+		memcg_list_is_empty = false;
+
+		/*
+		 * Cap by SCAN_FACTOR * remain budget: bounds scan cost
+		 * to the remaining writeback budget.
+		 */
+		remain = DIV_ROUND_UP(bytes_to_writeback - walk_arg.bytes_written, PAGE_SIZE);
+		nr_to_scan = min(nr_to_scan,
+				 remain * ZSWAP_WB_SCAN_FACTOR);
 
-		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
-					    &shrink_memcg_cb, NULL, &nr_to_walk);
-		scanned += 1 - nr_to_walk;
+		while (nr_scanned < nr_to_scan) {
+			unsigned long nr_to_walk = min(NR_ZSWAP_WB_BATCH,
+						       nr_to_scan - nr_scanned);
+
+			/*
+			 * Account for the committed budget rather than the walker's
+			 * actual delta. If the list is emptied concurrently, the
+			 * walker visits nothing and nr_scanned would never advance.
+			 */
+			nr_scanned += nr_to_walk;
+
+			list_lru_walk_one(&zswap_list_lru, nid, memcg,
+					  &shrink_memcg_cb,
+					  &walk_arg,
+					  &nr_to_walk);
+
+			if (walk_arg.bytes_written >= bytes_to_writeback)
+				return walk_arg.bytes_written;
+
+			if (walk_arg.encountered_page_in_swapcache)
+				break;
+
+			cond_resched();
+		}
 	}
 
-	if (!scanned)
+	/* Return -ENOENT if all zswap LRU lists are empty. */
+	if (memcg_list_is_empty)
 		return -ENOENT;
 
-	return shrunk ? 0 : -EAGAIN;
+	return walk_arg.bytes_written;
 }
 
 static void shrink_worker(struct work_struct *w)
 {
 	struct mem_cgroup *memcg;
-	int ret, failures = 0, attempts = 0;
+	int failures = 0, attempts = 0;
 	unsigned long thr;
+	long ret;
 
 	/* Reclaim down to the accept threshold */
 	thr = zswap_accept_thr_pages();
@@ -1368,7 +1446,7 @@ static void shrink_worker(struct work_struct *w)
 			goto resched;
 		}
 
-		ret = shrink_memcg(memcg);
+		ret = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
 		/* drop the extra reference */
 		mem_cgroup_put(memcg);
 
@@ -1382,7 +1460,7 @@ static void shrink_worker(struct work_struct *w)
 			continue;
 		++attempts;
 
-		if (ret && ++failures == MAX_RECLAIM_RETRIES)
+		if (ret <= 0 && ++failures == MAX_RECLAIM_RETRIES)
 			break;
 resched:
 		cond_resched();
@@ -1492,7 +1570,7 @@ bool zswap_store(struct folio *folio)
 	objcg = get_obj_cgroup_from_folio(folio);
 	if (objcg && !obj_cgroup_may_zswap(objcg)) {
 		memcg = get_mem_cgroup_from_objcg(objcg);
-		if (shrink_memcg(memcg)) {
+		if (shrink_memcg(memcg, 1) <= 0) {
 			mem_cgroup_put(memcg);
 			goto put_objcg;
 		}
-- 
2.34.1



  reply	other threads:[~2026-06-18  4:49 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-18  4:48 [PATCH v4 0/5] mm/zswap: Implement per-cgroup proactive writeback Hao Jia
2026-06-18  4:48 ` Hao Jia [this message]
2026-06-18  4:48 ` [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker() Hao Jia
2026-06-18  4:48 ` [PATCH v4 3/5] mm/zswap: Implement proactive writeback Hao Jia
2026-06-18  4:48 ` [PATCH v4 4/5] mm/zswap: Add per-memcg stat for " Hao Jia
2026-06-18  4:48 ` [PATCH v4 5/5] selftests/cgroup: Add tests for zswap " Hao Jia

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260618044857.69439-2-jiahao.kernel@gmail.com \
    --to=jiahao.kernel@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=chengming.zhou@linux.dev \
    --cc=hannes@cmpxchg.org \
    --cc=jiahao1@lixiang.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=muchun.song@linux.dev \
    --cc=nphamcs@gmail.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=tj@kernel.org \
    --cc=yosry@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.