* + zram-introduce-writeback-bio-batching-support.patch added to mm-new branch
@ 2025-11-13 22:58 Andrew Morton
0 siblings, 0 replies; only message in thread
From: Andrew Morton @ 2025-11-13 22:58 UTC (permalink / raw)
To: mm-commits, senozhatsky, richardycc, minchan, bgeffon, ywen.chen,
akpm
The patch titled
Subject: zram: introduce writeback bio batching support
has been added to the -mm mm-new branch. Its filename is
zram-introduce-writeback-bio-batching-support.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/zram-introduce-writeback-bio-batching-support.patch
This patch will later appear in the mm-new branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Note, mm-new is a provisional staging ground for work-in-progress
patches, and acceptance into mm-new is a notification for others to take
notice and to finish up reviews. Please do not hesitate to respond to
review feedback and post updated versions to replace or incrementally
fixup patches in mm-new.
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yuwen Chen <ywen.chen@foxmail.com>
Subject: zram: introduce writeback bio batching support
Date: Thu, 13 Nov 2025 17:53:59 +0900
Currently, zram writeback supports only a single bio writeback operation,
waiting for bio completion before post-processing the next pp-slot. This
works, in general, but has certain throughput limitations. Implement
batched (multiple) bio writeback support to take advantage of parallel
request processing and better request scheduling.
For the time being the writeback batch size (maximum number of in-flight
bio requests) is set to 1, so the behavior is the same as the previous
single-bio writeback. This is addressed in a follow-up patch, which adds
a writeback_batch_size device attribute.
Please refer to [1] and [2] for benchmarks:
: After applying this patch, a large number of pages being merged
: into batch write operations can be observed via the following test
: code, which effectively improves write-back performance.
:
: We used the following instructions to conduct a performance test
: on the write-back function of zram in the QEMU environment.
: $ echo "/dev/sdb" > /sys/block/zram0/backing_dev
: $ echo "1024000000" > /sys/block/zram0/disksize
: $ dd if=/dev/random of=/dev/zram0
: $ time echo "page_indexes=1-100000" > /sys/block/zram0/writeback
:
: before modification:
: real 0m 16.62s
: user 0m 0.00s
: sys 0m 5.98s
:
: real 0m 15.38s
: user 0m 0.00s
: sys 0m 5.31s
:
: real 0m 15.58s
: user 0m 0.00s
: sys 0m 5.49s
:
: after modification:
: real 0m 1.36s
: user 0m 0.00s
: sys 0m 1.13s
:
: real 0m 1.36s
: user 0m 0.00s
: sys 0m 1.11s
:
: real 0m 1.39s
: user 0m 0.00s
: sys 0m 1.16s
[senozhatsky@chromium.org: significantly reworked the initial patch so
that the approach and implementation resemble current zram
post-processing code]
Link: https://lkml.kernel.org/r/20251113085402.1811522-1-senozhatsky@chromium.org
Link: https://lkml.kernel.org/r/20251113085402.1811522-2-senozhatsky@chromium.org
Link: https://lore.kernel.org/linux-block/tencent_B2DC37E3A2AED0E7F179365FCB5D82455B08@qq.com [1]
Link: https://lore.kernel.org/linux-block/tencent_0FBBFC8AE0B97BC63B5D47CE1FF2BABFDA09@qq.com [2]
Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Co-developed-by: Richard Chang <richardycc@google.com>
Suggested-by: Minchan Kim <minchan@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
drivers/block/zram/zram_drv.c | 343 +++++++++++++++++++++++++-------
1 file changed, 278 insertions(+), 65 deletions(-)
--- a/drivers/block/zram/zram_drv.c~zram-introduce-writeback-bio-batching-support
+++ a/drivers/block/zram/zram_drv.c
@@ -734,20 +734,226 @@ static void read_from_bdev_async(struct
submit_bio(bio);
}
-static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
-{
- unsigned long blk_idx = 0;
- struct page *page = NULL;
+struct zram_wb_ctl {
+ struct list_head idle_reqs;
+ struct list_head inflight_reqs;
+
+ atomic_t num_inflight;
+ struct completion done;
+ struct blk_plug plug;
+};
+
+struct zram_wb_req {
+ unsigned long blk_idx;
+ struct page *page;
struct zram_pp_slot *pps;
struct bio_vec bio_vec;
struct bio bio;
- int ret = 0, err;
+
+ struct list_head entry;
+};
+
+static void release_wb_req(struct zram_wb_req *req)
+{
+ __free_page(req->page);
+ kfree(req);
+}
+
+static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
+{
+ /* We should never have inflight requests at this point */
+ WARN_ON(!list_empty(&wb_ctl->inflight_reqs));
+
+ while (!list_empty(&wb_ctl->idle_reqs)) {
+ struct zram_wb_req *req;
+
+ req = list_first_entry(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ list_del(&req->entry);
+ release_wb_req(req);
+ }
+
+ kfree(wb_ctl);
+}
+
+/* XXX: should be a per-device sysfs attr */
+#define ZRAM_WB_REQ_CNT 1
+
+static struct zram_wb_ctl *init_wb_ctl(void)
+{
+ struct zram_wb_ctl *wb_ctl;
+ int i;
+
+ wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
+ if (!wb_ctl)
+ return NULL;
+
+ INIT_LIST_HEAD(&wb_ctl->idle_reqs);
+ INIT_LIST_HEAD(&wb_ctl->inflight_reqs);
+ atomic_set(&wb_ctl->num_inflight, 0);
+ init_completion(&wb_ctl->done);
+
+ for (i = 0; i < ZRAM_WB_REQ_CNT; i++) {
+ struct zram_wb_req *req;
+
+ /*
+ * This is fatal condition only if we couldn't allocate
+ * any requests at all. Otherwise we just work with the
+ * requests that we have successfully allocated, so that
+ * writeback can still proceed, even if there is only one
+ * request on the idle list.
+ */
+ req = kzalloc(sizeof(*req), GFP_NOIO | __GFP_NOWARN);
+ if (!req)
+ break;
+
+ req->page = alloc_page(GFP_NOIO | __GFP_NOWARN);
+ if (!req->page) {
+ kfree(req);
+ break;
+ }
+
+ INIT_LIST_HEAD(&req->entry);
+ list_add(&req->entry, &wb_ctl->idle_reqs);
+ }
+
+ /* We couldn't allocate any requests, so writeback is not possible */
+ if (list_empty(&wb_ctl->idle_reqs))
+ goto release_wb_ctl;
+
+ return wb_ctl;
+
+release_wb_ctl:
+ release_wb_ctl(wb_ctl);
+ return NULL;
+}
+
+static void zram_account_writeback_rollback(struct zram *zram)
+{
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable)
+ zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
+}
+
+static void zram_account_writeback_submit(struct zram *zram)
+{
+ spin_lock(&zram->wb_limit_lock);
+ if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+ zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+ spin_unlock(&zram->wb_limit_lock);
+}
+
+static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
+{
u32 index;
+ int err;
- page = alloc_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
+ index = req->pps->index;
+ release_pp_slot(zram, req->pps);
+ req->pps = NULL;
+ err = blk_status_to_errno(req->bio.bi_status);
+ if (err) {
+ /*
+ * Failed wb requests should not be accounted in wb_limit
+ * (if enabled).
+ */
+ zram_account_writeback_rollback(zram);
+ return err;
+ }
+
+ atomic64_inc(&zram->stats.bd_writes);
+ zram_slot_lock(zram, index);
+ /*
+ * We release slot lock during writeback so slot can change under us:
+ * slot_free() or slot_free() and zram_write_page(). In both cases
+ * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
+ * set ZRAM_PP_SLOT on such slots until current post-processing
+ * finishes.
+ */
+ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
+ goto out;
+
+ zram_free_page(zram, index);
+ zram_set_flag(zram, index, ZRAM_WB);
+ zram_set_handle(zram, index, req->blk_idx);
+ atomic64_inc(&zram->stats.pages_stored);
+
+out:
+ zram_slot_unlock(zram, index);
+ return 0;
+}
+
+static void zram_writeback_endio(struct bio *bio)
+{
+ struct zram_wb_ctl *wb_ctl = bio->bi_private;
+
+ if (atomic_dec_return(&wb_ctl->num_inflight) == 0)
+ complete(&wb_ctl->done);
+}
+
+static void zram_submit_wb_request(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl,
+ struct zram_wb_req *req)
+{
+ /*
+ * wb_limit (if enabled) should be adjusted before submission,
+ * so that we don't over-submit.
+ */
+ zram_account_writeback_submit(zram);
+ atomic_inc(&wb_ctl->num_inflight);
+ list_add_tail(&req->entry, &wb_ctl->inflight_reqs);
+ submit_bio(&req->bio);
+}
+
+static struct zram_wb_req *select_idle_req(struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req;
+
+ req = list_first_entry_or_null(&wb_ctl->idle_reqs,
+ struct zram_wb_req, entry);
+ if (req)
+ list_del(&req->entry);
+ return req;
+}
+
+static int zram_wb_wait_for_completion(struct zram *zram,
+ struct zram_wb_ctl *wb_ctl)
+{
+ int ret = 0;
+
+ if (atomic_read(&wb_ctl->num_inflight))
+ wait_for_completion_io(&wb_ctl->done);
+
+ reinit_completion(&wb_ctl->done);
+ while (!list_empty(&wb_ctl->inflight_reqs)) {
+ struct zram_wb_req *req;
+ int err;
+
+ req = list_first_entry(&wb_ctl->inflight_reqs,
+ struct zram_wb_req, entry);
+ list_move(&req->entry, &wb_ctl->idle_reqs);
+
+ err = zram_writeback_complete(zram, req);
+ if (err)
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int zram_writeback_slots(struct zram *zram,
+ struct zram_pp_ctl *ctl,
+ struct zram_wb_ctl *wb_ctl)
+{
+ struct zram_wb_req *req = NULL;
+ unsigned long blk_idx = 0;
+ struct zram_pp_slot *pps;
+ int ret = 0, err;
+ u32 index = 0;
+
+ blk_start_plug(&wb_ctl->plug);
while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
@@ -757,6 +963,26 @@ static int zram_writeback_slots(struct z
}
spin_unlock(&zram->wb_limit_lock);
+ while (!req) {
+ req = select_idle_req(wb_ctl);
+ if (req)
+ break;
+
+ blk_finish_plug(&wb_ctl->plug);
+ err = zram_wb_wait_for_completion(zram, wb_ctl);
+ blk_start_plug(&wb_ctl->plug);
+ /*
+ * BIO errors are not fatal, we continue and simply
+ * attempt to writeback the remaining objects (pages).
+ * At the same time we need to signal user-space that
+ * some writes (at least one, but also could be all of
+ * them) were not successful and we do so by returning
+ * the most recent BIO error.
+ */
+ if (err)
+ ret = err;
+ }
+
if (!blk_idx) {
blk_idx = alloc_block_bdev(zram);
if (!blk_idx) {
@@ -765,7 +991,6 @@ static int zram_writeback_slots(struct z
}
}
- index = pps->index;
zram_slot_lock(zram, index);
/*
* scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
@@ -775,67 +1000,47 @@ static int zram_writeback_slots(struct z
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
- if (zram_read_from_zspool(zram, page, index))
+ if (zram_read_from_zspool(zram, req->page, index))
goto next;
zram_slot_unlock(zram, index);
- bio_init(&bio, zram->bdev, &bio_vec, 1,
- REQ_OP_WRITE | REQ_SYNC);
- bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
/*
- * XXX: A single page IO would be inefficient for write
- * but it would be not bad as starter.
+ * From now on pp-slot is owned by the req, remove it from
+ * its pps bucket.
*/
- err = submit_bio_wait(&bio);
- if (err) {
- release_pp_slot(zram, pps);
- /*
- * BIO errors are not fatal, we continue and simply
- * attempt to writeback the remaining objects (pages).
- * At the same time we need to signal user-space that
- * some writes (at least one, but also could be all of
- * them) were not successful and we do so by returning
- * the most recent BIO error.
- */
- ret = err;
- continue;
- }
+ list_del_init(&pps->entry);
- atomic64_inc(&zram->stats.bd_writes);
- zram_slot_lock(zram, index);
- /*
- * Same as above, we release slot lock during writeback so
- * slot can change under us: slot_free() or slot_free() and
- * reallocation (zram_write_page()). In both cases slot loses
- * ZRAM_PP_SLOT flag. No concurrent post-processing can set
- * ZRAM_PP_SLOT on such slots until current post-processing
- * finishes.
- */
- if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
- goto next;
+ req->blk_idx = blk_idx;
+ req->pps = pps;
+ bio_init(&req->bio, zram->bdev, &req->bio_vec, 1,
+ REQ_OP_WRITE | REQ_SYNC);
+ req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
+ req->bio.bi_end_io = zram_writeback_endio;
+ req->bio.bi_private = wb_ctl;
+ __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
- zram_free_page(zram, index);
- zram_set_flag(zram, index, ZRAM_WB);
- zram_set_handle(zram, index, blk_idx);
+ zram_submit_wb_request(zram, wb_ctl, req);
blk_idx = 0;
- atomic64_inc(&zram->stats.pages_stored);
- spin_lock(&zram->wb_limit_lock);
- if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
- zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
- spin_unlock(&zram->wb_limit_lock);
+ req = NULL;
+ continue;
+
next:
zram_slot_unlock(zram, index);
release_pp_slot(zram, pps);
-
cond_resched();
}
- if (blk_idx)
- free_block_bdev(zram, blk_idx);
- if (page)
- __free_page(page);
+ /*
+ * Selected idle req, but never submitted it due to some error or
+ * wb limit.
+ */
+ if (req)
+ release_wb_req(req);
+
+ blk_finish_plug(&wb_ctl->plug);
+ err = zram_wb_wait_for_completion(zram, wb_ctl);
+ if (err)
+ ret = err;
return ret;
}
@@ -948,7 +1153,8 @@ static ssize_t writeback_store(struct de
struct zram *zram = dev_to_zram(dev);
u64 nr_pages = zram->disksize >> PAGE_SHIFT;
unsigned long lo = 0, hi = nr_pages;
- struct zram_pp_ctl *ctl = NULL;
+ struct zram_pp_ctl *pp_ctl = NULL;
+ struct zram_wb_ctl *wb_ctl = NULL;
char *args, *param, *val;
ssize_t ret = len;
int err, mode = 0;
@@ -970,8 +1176,14 @@ static ssize_t writeback_store(struct de
goto release_init_lock;
}
- ctl = init_pp_ctl();
- if (!ctl) {
+ pp_ctl = init_pp_ctl();
+ if (!pp_ctl) {
+ ret = -ENOMEM;
+ goto release_init_lock;
+ }
+
+ wb_ctl = init_wb_ctl();
+ if (!wb_ctl) {
ret = -ENOMEM;
goto release_init_lock;
}
@@ -1000,7 +1212,7 @@ static ssize_t writeback_store(struct de
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1011,7 +1223,7 @@ static ssize_t writeback_store(struct de
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@@ -1022,7 +1234,7 @@ static ssize_t writeback_store(struct de
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
@@ -1033,17 +1245,18 @@ static ssize_t writeback_store(struct de
goto release_init_lock;
}
- scan_slots_for_writeback(zram, mode, lo, hi, ctl);
+ scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
}
- err = zram_writeback_slots(zram, ctl);
+ err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
if (err)
ret = err;
release_init_lock:
- release_pp_ctl(zram, ctl);
+ release_pp_ctl(zram, pp_ctl);
+ release_wb_ctl(wb_ctl);
atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
_
Patches currently in -mm which might be from ywen.chen@foxmail.com are
zram-introduce-writeback-bio-batching-support.patch
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-11-13 22:58 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-13 22:58 + zram-introduce-writeback-bio-batching-support.patch added to mm-new branch Andrew Morton
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.