From: Wenchao Hao <haowenchao22@gmail.com>
To: Andrew Morton <akpm@linux-foundation.org>,
Barry Song <21cnbao@gmail.com>,
Chengming Zhou <chengming.zhou@linux.dev>,
Jens Axboe <axboe@kernel.dk>,
Johannes Weiner <hannes@cmpxchg.org>,
linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, Minchan Kim <minchan@kernel.org>,
Nhat Pham <nphamcs@gmail.com>,
Sergey Senozhatsky <senozhatsky@chromium.org>,
Yosry Ahmed <yosry@kernel.org>
Cc: Wenchao Hao <haowenchao22@gmail.com>,
Wenchao Hao <haowenchao@xiaomi.com>
Subject: [RFC PATCH v3 1/4] mm/zsmalloc: introduce deferred free framework with callback ops
Date: Fri, 8 May 2026 14:07:21 +0800
Message-ID: <20260508060724.3810904-2-haowenchao@xiaomi.com>
In-Reply-To: <20260508060724.3810904-1-haowenchao@xiaomi.com>
Add a per-cpu deferred free mechanism to zsmalloc with a callback
interface that lets callers (zram, zswap) customize push and drain
behavior.
Each CPU owns a single-page buffer. The hot path (zs_free_deferred)
writes a value into the current CPU's buffer via the push callback
with preemption disabled; no locks or atomics are taken. When the
buffer fills, it is swapped with a fresh page from a pre-allocated
page pool and the full page is queued to a WQ_UNBOUND worker for
draining. The drain worker invokes the drain callback, which performs
the actual expensive work (zs_free, slot_free, etc.) in batch, away
from the original hot path.
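To illustrate the callback contract, here is a hypothetical
caller-side implementation (example_push/example_drain are not part
of this patch): the buffer is treated as an array of unsigned long
handles, with ZS_PUSH_FULL_QUEUED signalling that the value was
stored and the buffer is now full, and ZS_PUSH_FULL that there was
no room left:

  static enum zs_push_ret example_push(void *buf, unsigned int count,
                                       unsigned long value)
  {
          unsigned long *slots = buf;
          unsigned int max = PAGE_SIZE / sizeof(unsigned long);

          if (count >= max)
                  return ZS_PUSH_FULL;    /* no room, value not stored */

          slots[count] = value;
          /* stored; if the buffer just became full, ask to queue it */
          return count + 1 == max ? ZS_PUSH_FULL_QUEUED : ZS_PUSH_OK;
  }

  static void example_drain(void *private, void *buf, unsigned int count)
  {
          struct zs_pool *pool = private; /* assumes private is the pool */
          unsigned long *slots = buf;
          unsigned int i;

          for (i = 0; i < count; i++)
                  zs_free(pool, slots[i]);
  }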
Page pool management:
- The pool is pre-allocated at enable time (ZS_DEFERRED_POOL_SIZE pages)
- Full buffers are drained and returned to the pool
- If no free page is available when a buffer fills, the push falls
  back to synchronous processing by the caller (see the usage sketch
  below)
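Usage is expected to look roughly like the following sketch (again
hypothetical, reusing example_push/example_drain from above): enable
deferred free once at pool setup, then fall back to synchronous
zs_free() whenever zs_free_deferred() cannot take the value:

  static const struct zs_deferred_ops example_ops = {
          .push  = example_push,
          .drain = example_drain,
  };

  /* at pool setup */
  if (zs_pool_enable_deferred_free(pool, &example_ops, pool))
          pr_warn("deferred free unavailable, freeing synchronously\n");

  /* on the free path */
  if (!zs_free_deferred(pool, handle))
          zs_free(pool, handle);  /* buffer full and no spare page */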
Signed-off-by: Wenchao Hao <haowenchao@xiaomi.com>
---
include/linux/zsmalloc.h |  16 +++
mm/zsmalloc.c            | 208 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 223 insertions(+), 1 deletion(-)
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 478410c880b1..8d6c675b10dc 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -24,12 +24,28 @@ struct zs_pool_stats {
struct zs_pool;
struct scatterlist;
+enum zs_push_ret {
+ ZS_PUSH_OK = 0,
+ ZS_PUSH_FULL,
+ ZS_PUSH_FULL_QUEUED,
+};
+
+struct zs_deferred_ops {
+ enum zs_push_ret (*push)(void *buf, unsigned int count,
+ unsigned long value);
+ void (*drain)(void *private, void *buf, unsigned int count);
+};
+
struct zs_pool *zs_create_pool(const char *name);
void zs_destroy_pool(struct zs_pool *pool);
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags,
const int nid);
void zs_free(struct zs_pool *pool, unsigned long obj);
+int zs_pool_enable_deferred_free(struct zs_pool *pool,
+ const struct zs_deferred_ops *ops,
+ void *private);
+bool zs_free_deferred(struct zs_pool *pool, unsigned long value);
size_t zs_huge_class_size(struct zs_pool *pool);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 63128ddb7959..d8220a8753a7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -196,6 +196,13 @@ struct link_free {
static struct kmem_cache *handle_cachep;
static struct kmem_cache *zspage_cachep;
+#define ZS_DEFERRED_POOL_SIZE (256 * 1024 / PAGE_SIZE)
+
+struct zs_deferred_percpu {
+ unsigned int count;
+ void *buf;
+};
+
struct zs_pool {
const char *name;
@@ -217,6 +224,18 @@ struct zs_pool {
/* protect zspage migration/compaction */
rwlock_t lock;
atomic_t compaction_in_progress;
+
+ /* per-cpu deferred free */
+ const struct zs_deferred_ops *deferred_ops;
+ void *deferred_private;
+ struct zs_deferred_percpu __percpu *deferred;
+ struct work_struct deferred_work;
+ struct workqueue_struct *deferred_wq;
+ struct list_head deferred_pool;
+ unsigned int deferred_pool_count;
+ spinlock_t deferred_pool_lock;
+ struct list_head deferred_drain_list;
+ spinlock_t deferred_drain_lock;
};
static inline void zpdesc_set_first(struct zpdesc *zpdesc)
@@ -1416,6 +1435,171 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_free);
+static struct page *deferred_pool_get(struct zs_pool *pool)
+{
+ struct page *page = NULL;
+
+ spin_lock(&pool->deferred_pool_lock);
+ if (!list_empty(&pool->deferred_pool)) {
+ page = list_first_entry(&pool->deferred_pool, struct page, lru);
+ list_del(&page->lru);
+ pool->deferred_pool_count--;
+ }
+ spin_unlock(&pool->deferred_pool_lock);
+ return page;
+}
+
+static void deferred_pool_put(struct zs_pool *pool, struct page *page)
+{
+ spin_lock(&pool->deferred_pool_lock);
+ list_add_tail(&page->lru, &pool->deferred_pool);
+ pool->deferred_pool_count++;
+ spin_unlock(&pool->deferred_pool_lock);
+}
+
+static void zs_deferred_work_fn(struct work_struct *work)
+{
+ struct zs_pool *pool = container_of(work, struct zs_pool, deferred_work);
+ struct page *page;
+
+ while (true) {
+ unsigned int count;
+
+ spin_lock(&pool->deferred_drain_lock);
+ if (list_empty(&pool->deferred_drain_list)) {
+ spin_unlock(&pool->deferred_drain_lock);
+ break;
+ }
+ page = list_first_entry(&pool->deferred_drain_list,
+ struct page, lru);
+ list_del(&page->lru);
+ count = page_private(page);
+ spin_unlock(&pool->deferred_drain_lock);
+
+ pool->deferred_ops->drain(pool->deferred_private,
+ page_address(page), count);
+ deferred_pool_put(pool, page);
+ cond_resched();
+ }
+}
+
+bool zs_free_deferred(struct zs_pool *pool, unsigned long value)
+{
+ struct zs_deferred_percpu *def;
+ struct page *new_page, *full_page;
+ enum zs_push_ret ret;
+
+ if (!pool->deferred)
+ return false;
+
+ def = get_cpu_ptr(pool->deferred);
+
+ ret = pool->deferred_ops->push(def->buf, def->count, value);
+ if (ret == ZS_PUSH_OK) {
+ def->count++;
+ put_cpu_ptr(pool->deferred);
+ return true;
+ }
+
+ if (ret == ZS_PUSH_FULL_QUEUED)
+ def->count++;
+
+ new_page = deferred_pool_get(pool);
+ if (new_page) {
+ full_page = virt_to_page(def->buf);
+ set_page_private(full_page, def->count);
+ def->buf = page_address(new_page);
+ def->count = 0;
+
+ if (ret == ZS_PUSH_FULL) {
+ pool->deferred_ops->push(def->buf, 0, value);
+ def->count = 1;
+ }
+ put_cpu_ptr(pool->deferred);
+
+ spin_lock(&pool->deferred_drain_lock);
+ list_add_tail(&full_page->lru, &pool->deferred_drain_list);
+ spin_unlock(&pool->deferred_drain_lock);
+ queue_work(pool->deferred_wq, &pool->deferred_work);
+ return true;
+ }
+ put_cpu_ptr(pool->deferred);
+
+ /* ZS_PUSH_FULL_QUEUED: value already queued, will be drained eventually */
+ if (ret == ZS_PUSH_FULL_QUEUED)
+ return true;
+
+ /* ZS_PUSH_FULL: value not queued, caller must fall back */
+ return false;
+}
+EXPORT_SYMBOL_GPL(zs_free_deferred);
+
+int zs_pool_enable_deferred_free(struct zs_pool *pool,
+ const struct zs_deferred_ops *ops,
+ void *private)
+{
+ int cpu;
+ unsigned int pg_idx;
+ struct page *page, *tmp;
+
+ pool->deferred_ops = ops;
+ pool->deferred_private = private;
+
+ INIT_WORK(&pool->deferred_work, zs_deferred_work_fn);
+ pool->deferred_wq = alloc_workqueue("zs_drain", WQ_UNBOUND, 0);
+ if (!pool->deferred_wq)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&pool->deferred_pool);
+ spin_lock_init(&pool->deferred_pool_lock);
+ pool->deferred_pool_count = 0;
+ INIT_LIST_HEAD(&pool->deferred_drain_list);
+ spin_lock_init(&pool->deferred_drain_lock);
+
+ for (pg_idx = 0; pg_idx < ZS_DEFERRED_POOL_SIZE; pg_idx++) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto err_pages;
+ list_add_tail(&page->lru, &pool->deferred_pool);
+ pool->deferred_pool_count++;
+ }
+
+ pool->deferred = alloc_percpu(struct zs_deferred_percpu);
+ if (!pool->deferred)
+ goto err_pages;
+
+ for_each_possible_cpu(cpu) {
+ struct zs_deferred_percpu *def = per_cpu_ptr(pool->deferred, cpu);
+
+ page = deferred_pool_get(pool);
+ if (!page)
+ goto err_percpu;
+ def->buf = page_address(page);
+ def->count = 0;
+ }
+
+ return 0;
+
+err_percpu:
+ for_each_possible_cpu(cpu) {
+ struct zs_deferred_percpu *def = per_cpu_ptr(pool->deferred, cpu);
+
+ if (def->buf)
+ deferred_pool_put(pool, virt_to_page(def->buf));
+ }
+ free_percpu(pool->deferred);
+ pool->deferred = NULL;
+err_pages:
+ list_for_each_entry_safe(page, tmp, &pool->deferred_pool, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ destroy_workqueue(pool->deferred_wq);
+ pool->deferred_wq = NULL;
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(zs_pool_enable_deferred_free);
+
static void zs_object_copy(struct size_class *class, unsigned long dst,
unsigned long src)
{
@@ -2182,9 +2366,31 @@ EXPORT_SYMBOL_GPL(zs_create_pool);
void zs_destroy_pool(struct zs_pool *pool)
{
- int i;
+ int i, cpu;
+ struct page *page, *tmp;
zs_unregister_shrinker(pool);
+
+ if (pool->deferred) {
+ flush_work(&pool->deferred_work);
+ for_each_possible_cpu(cpu) {
+ struct zs_deferred_percpu *def =
+ per_cpu_ptr(pool->deferred, cpu);
+
+ if (def->buf && def->count)
+ pool->deferred_ops->drain(pool->deferred_private,
+ def->buf, def->count);
+ if (def->buf)
+ deferred_pool_put(pool, virt_to_page(def->buf));
+ }
+ free_percpu(pool->deferred);
+ list_for_each_entry_safe(page, tmp, &pool->deferred_pool, lru) {
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ destroy_workqueue(pool->deferred_wq);
+ }
+
zs_flush_migration(pool);
zs_pool_stat_destroy(pool);
--
2.34.1