* add a "discard cache" debug option to zloop @ 2026-03-18 5:53 Christoph Hellwig 2026-03-18 5:53 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig 2026-03-18 5:53 ` [PATCH 2/2] zloop: forget write cache on force removal Christoph Hellwig 0 siblings, 2 replies; 8+ messages in thread From: Christoph Hellwig @ 2026-03-18 5:53 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Hi all, this series adds a new option to zloop that, on device removal, discards data that was not committed to stable storage by a flush operation. The idea is to help test that file system code does the right thing in the face of volatile write caches. For conventional devices, this can be tested using dm-log-writes, but the concepts there don't work for sequential write required zones. Instead this adds an option to zloop, which records the write pointer at the last cache flush for each zone file in an xattr, and truncates the files down to that value on removal, simulating losing the contents of the volatile write cache. Diffstat: Documentation/admin-guide/blockdev/zoned_loop.rst | 5 drivers/block/zloop.c | 337 ++++++++++++++-------- 2 files changed, 226 insertions(+), 116 deletions(-) ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/2] zloop: refactor zloop_rw 2026-03-18 5:53 add a "discard cache" debug option to zloop Christoph Hellwig @ 2026-03-18 5:53 ` Christoph Hellwig 2026-03-18 6:58 ` Damien Le Moal 2026-03-18 5:53 ` [PATCH 2/2] zloop: forget write cache on force removal Christoph Hellwig 1 sibling, 1 reply; 8+ messages in thread From: Christoph Hellwig @ 2026-03-18 5:53 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Split out two helper functions to make the function more readable and to avoid conditional locking. Signed-off-by: Christoph Hellwig <hch@lst.de> --- drivers/block/zloop.c | 240 ++++++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 51c043342127..8ca37ca1935a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -378,125 +378,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret) zloop_put_cmd(cmd); } -static void zloop_rw(struct zloop_cmd *cmd) +static int zloop_do_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); + int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE; + unsigned int nr_bvec = blk_rq_nr_bvec(rq); struct zloop_device *zlo = rq->q->queuedata; - unsigned int zone_no = rq_zone_no(rq); - sector_t sector = blk_rq_pos(rq); - sector_t nr_sectors = blk_rq_sectors(rq); - bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; - bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; - int rw = is_write ? 
ITER_SOURCE : ITER_DEST; + struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; struct req_iterator rq_iter; - struct zloop_zone *zone; struct iov_iter iter; - struct bio_vec tmp; - unsigned long flags; - sector_t zone_end; - unsigned int nr_bvec; - int ret; - - atomic_set(&cmd->ref, 2); - cmd->sector = sector; - cmd->nr_sectors = nr_sectors; - cmd->ret = 0; - - if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { - ret = -EIO; - goto out; - } - - /* We should never get an I/O beyond the device capacity. */ - if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { - ret = -EIO; - goto out; - } - zone = &zlo->zones[zone_no]; - zone_end = zone->start + zlo->zone_capacity; - - /* - * The block layer should never send requests that are not fully - * contained within the zone. - */ - if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { - ret = -EIO; - goto out; - } - - if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { - mutex_lock(&zone->lock); - ret = zloop_update_seq_zone(zlo, zone_no); - mutex_unlock(&zone->lock); - if (ret) - goto out; - } - - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { - mutex_lock(&zone->lock); - - spin_lock_irqsave(&zone->wp_lock, flags); - - /* - * Zone append operations always go at the current write - * pointer, but regular write operations must already be - * aligned to the write pointer when submitted. - */ - if (is_append) { - /* - * If ordered zone append is in use, we already checked - * and set the target sector in zloop_queue_rq(). 
- */ - if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL || - zone->wp + nr_sectors > zone_end) { - spin_unlock_irqrestore(&zone->wp_lock, - flags); - ret = -EIO; - goto unlock; - } - sector = zone->wp; - } - cmd->sector = sector; - } else if (sector != zone->wp) { - spin_unlock_irqrestore(&zone->wp_lock, flags); - pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", - zone_no, sector, zone->wp); - ret = -EIO; - goto unlock; - } - - /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Advance the write pointer, unless ordered zone append is in - * use. If the write fails, the write pointer position will be - * corrected when the next I/O starts execution. - */ - if (!is_append || !zlo->ordered_zone_append) { - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; - } - } - - spin_unlock_irqrestore(&zone->wp_lock, flags); - } - - nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { - struct bio_vec *bvec; + struct bio_vec tmp, *bvec; cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); - if (!cmd->bvec) { - ret = -EIO; - goto unlock; - } + if (!cmd->bvec) + return -EIO; /* * The bios of the request may be started from the middle of @@ -522,7 +419,7 @@ static void zloop_rw(struct zloop_cmd *cmd) iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } - cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) @@ -530,12 +427,123 @@ static void zloop_rw(struct zloop_cmd *cmd) cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); - else - ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); -unlock: - if 
(!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + return zone->file->f_op->write_iter(&cmd->iocb, &iter); + return zone->file->f_op->read_iter(&cmd->iocb, &iter); +} + +static int zloop_seq_write_prep(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&zone->wp_lock, flags); + + /* + * Zone append operations always go at the current write pointer, but + * regular write operations must already be aligned to the write pointer + * when submitted. + */ + if (is_append) { + /* + * If ordered zone append is in use, we already checked and set + * the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + ret = -EIO; + goto out_unlock; + } + cmd->sector = zone->wp; + } + } else { + if (cmd->sector != zone->wp) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, cmd->sector, zone->wp); + ret = -EIO; + goto out_unlock; + } + } + + /* Implicitly open the target zone. */ + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Advance the write pointer, unless ordered zone append is in use. If + * the write fails, the write pointer position will be corrected when + * the next I/O starts execution. 
+ */ + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + } +out_unlock: + spin_unlock_irqrestore(&zone->wp_lock, flags); + return ret; +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + struct zloop_zone *zone; + int ret = -EIO; + + atomic_set(&cmd->ref, 2); + cmd->sector = blk_rq_pos(rq); + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + if (WARN_ON_ONCE(is_append && !zlo->zone_append)) + goto out; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) + goto out; + + zone = &zlo->zones[zone_no]; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. + */ + if (WARN_ON_ONCE(cmd->sector + nr_sectors > + zone->start + zlo->zone_size)) + goto out; + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + ret = zloop_seq_write_prep(cmd); + if (!ret) + ret = zloop_do_rw(cmd); + mutex_unlock(&zone->lock); + } else { + ret = zloop_do_rw(cmd); + } out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); -- 2.47.3 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/2] zloop: refactor zloop_rw 2026-03-18 5:53 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig @ 2026-03-18 6:58 ` Damien Le Moal 0 siblings, 0 replies; 8+ messages in thread From: Damien Le Moal @ 2026-03-18 6:58 UTC (permalink / raw) To: Christoph Hellwig, Jens Axboe; +Cc: linux-block On 3/18/26 2:53 PM, Christoph Hellwig wrote: > Split out two helpers functions to make the function more readable and > to avoid conditional locking. > > Signed-off-by: Christoph Hellwig <hch@lst.de> Nice cleanup. Reviewed-by: Damien Le Moal <dlemoal@kernel.org> -- Damien Le Moal Western Digital Research ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 2/2] zloop: forget write cache on force removal 2026-03-18 5:53 add a "discard cache" debug option to zloop Christoph Hellwig 2026-03-18 5:53 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig @ 2026-03-18 5:53 ` Christoph Hellwig 2026-03-18 7:03 ` Damien Le Moal 1 sibling, 1 reply; 8+ messages in thread From: Christoph Hellwig @ 2026-03-18 5:53 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Add a new option that causes zloop to truncate the zone files to the write pointer value recorded at the last cache flush to simulate unclean shutdowns. Signed-off-by: Christoph Hellwig <hch@lst.de> --- .../admin-guide/blockdev/zoned_loop.rst | 5 + drivers/block/zloop.c | 97 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst index 6aa865424ac3..237ee2fccb82 100644 --- a/Documentation/admin-guide/blockdev/zoned_loop.rst +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst @@ -104,6 +104,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering. (extents), as when enabled, this can significantly reduce the number of data extents needed to for a file data mapping. +discard_write_cache Discard all data that was not explicitly persisted using a + flush operation when removed by truncating each zone file + to the size recorded during the last flush operation. + This simulates power fail events where uncommitted data is + lost. =================== ========================================================= 3) Deleting a Zoned Device diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 8ca37ca1935a..86a1324c27b3 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -17,6 +17,7 @@ #include <linux/mutex.h> #include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/xattr.h> /* * Options for adding (and removing) a device. 
@@ -34,6 +35,7 @@ enum { ZLOOP_OPT_BUFFERED_IO = (1 << 8), ZLOOP_OPT_ZONE_APPEND = (1 << 9), ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10), + ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11), }; static const match_table_t zloop_opt_tokens = { @@ -48,6 +50,7 @@ static const match_table_t zloop_opt_tokens = { { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, + { ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" }, { ZLOOP_OPT_ERR, NULL } }; @@ -79,6 +82,7 @@ struct zloop_options { bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; }; /* @@ -119,6 +123,7 @@ struct zloop_device { bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; const char *base_dir; struct file *data_dir; @@ -550,6 +555,41 @@ static void zloop_rw(struct zloop_cmd *cmd) zloop_put_cmd(cmd); } +static inline bool zloop_zone_is_active(struct zloop_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_CLOSED: + return true; + default: + return false; + } +} + +static int zloop_record_safe_wps(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + + if (!zloop_zone_is_active(zone)) + continue; + ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &zone->wp, sizeof(zone->wp), 0); + if (ret) { + pr_err("%pg: failed to record write pointer (%d)\n", + zlo->disk->part0, ret); + return ret; + } + } + + return 0; +} + /* * Sync the entire FS containing the zone files instead of walking all files. 
*/ @@ -558,6 +598,12 @@ static int zloop_flush(struct zloop_device *zlo) struct super_block *sb = file_inode(zlo->data_dir)->i_sb; int ret; + if (zlo->discard_write_cache) { + ret = zloop_record_safe_wps(zlo); + if (ret) + return ret; + } + down_read(&sb->s_umount); ret = sync_filesystem(sb); up_read(&sb->s_umount); @@ -1054,6 +1100,7 @@ static int zloop_ctl_add(struct zloop_options *opts) zlo->zone_append = opts->zone_append; if (zlo->zone_append) zlo->ordered_zone_append = opts->ordered_zone_append; + zlo->discard_write_cache = opts->discard_write_cache; zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, opts->nr_queues * opts->queue_depth, zlo->id); @@ -1176,6 +1223,49 @@ static int zloop_ctl_add(struct zloop_options *opts) return ret; } +static void zloop_truncate(struct file *file, loff_t pos) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct dentry *dentry = file_dentry(file); + struct iattr newattrs; + + newattrs.ia_size = pos; + newattrs.ia_valid = ATTR_SIZE; + + inode_lock(dentry->d_inode); + notify_change(idmap, dentry, &newattrs, NULL); + inode_unlock(dentry->d_inode); +} + +static void zloop_forget_cache(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0); + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + sector_t old_wp; + + if (!zloop_zone_is_active(zone)) + continue; + + ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &old_wp, sizeof(old_wp)); + if (ret == -ENODATA) { + old_wp = 0; + } else if (ret != sizeof(old_wp)) { + pr_err("%pg: failed to retrieve write pointer (%d)\n", + zlo->disk->part0, ret); + continue; + } + if (old_wp < zone->wp) + zloop_truncate(file, old_wp); + } +} + static int zloop_ctl_remove(struct zloop_options *opts) { struct zloop_device *zlo; @@ -1210,6 +1300,10 @@ static int zloop_ctl_remove(struct zloop_options *opts) 
return ret; del_gendisk(zlo->disk); + + if (zlo->discard_write_cache) + zloop_forget_cache(zlo); + put_disk(zlo->disk); pr_info("Removed device %d\n", opts->id); @@ -1361,6 +1455,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) case ZLOOP_OPT_ORDERED_ZONE_APPEND: opts->ordered_zone_append = true; break; + case ZLOOP_OPT_DISCARD_WRITE_CACHE: + opts->discard_write_cache = true; + break; case ZLOOP_OPT_ERR: default: pr_warn("unknown parameter or missing value '%s'\n", p); -- 2.47.3 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 2/2] zloop: forget write cache on force removal 2026-03-18 5:53 ` [PATCH 2/2] zloop: forget write cache on force removal Christoph Hellwig @ 2026-03-18 7:03 ` Damien Le Moal 0 siblings, 0 replies; 8+ messages in thread From: Damien Le Moal @ 2026-03-18 7:03 UTC (permalink / raw) To: Christoph Hellwig, Jens Axboe; +Cc: linux-block On 3/18/26 2:53 PM, Christoph Hellwig wrote: > Add a new options that causes zloop to truncate the zone files to the > write pointer value recorded at the last cache flush to simulate > unclean shutdowns. > > Signed-off-by: Christoph Hellwig <hch@lst.de> Looks OK to me. One nit below. With that corrected: Reviewed-by: Damien Le Moal <dlemoal@kernel.org> > --- > .../admin-guide/blockdev/zoned_loop.rst | 5 + > drivers/block/zloop.c | 97 +++++++++++++++++++ > 2 files changed, 102 insertions(+) > > diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst > index 6aa865424ac3..237ee2fccb82 100644 > --- a/Documentation/admin-guide/blockdev/zoned_loop.rst > +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst > @@ -104,6 +104,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering. > (extents), as when enabled, this can significantly reduce > the number of data extents needed to for a file data > mapping. > +discard_write_cache Discard all data that was not explicitly persisted using a > + flush operation when removed by truncating each zone file flush operation when the device is removed by truncating each zone file... > + to the size recorded during the last flush operation. > + This simulates power fail events where uncommitted data is > + lost. > =================== ========================================================= -- Damien Le Moal Western Digital Research ^ permalink raw reply [flat|nested] 8+ messages in thread
* add a "discard cache" debug option to zloop v2 @ 2026-03-19 6:02 Christoph Hellwig 2026-03-19 6:02 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig 0 siblings, 1 reply; 8+ messages in thread From: Christoph Hellwig @ 2026-03-19 6:02 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Hi all, this series adds a new option to zloop that, on device removal, discards data that was not committed to stable storage by a flush operation. The idea is to help test that file system code does the right thing in the face of volatile write caches. For conventional devices, this can be tested using dm-log-writes, but the concepts there don't work for sequential write required zones. Instead this adds an option to zloop, which records the write pointer at the last cache flush for each zone file in an xattr, and truncates the files down to that value on removal, simulating losing the contents of the volatile write cache. Changes since v1: - fix up the documentation Diffstat: Documentation/admin-guide/blockdev/zoned_loop.rst | 5 drivers/block/zloop.c | 337 ++++++++++++++-------- 2 files changed, 226 insertions(+), 116 deletions(-) ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/2] zloop: refactor zloop_rw 2026-03-19 6:02 add a "discard cache" debug option to zloop v2 Christoph Hellwig @ 2026-03-19 6:02 ` Christoph Hellwig 2026-03-19 14:06 ` Martin K. Petersen 0 siblings, 1 reply; 8+ messages in thread From: Christoph Hellwig @ 2026-03-19 6:02 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Split out two helper functions to make the function more readable and to avoid conditional locking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Damien Le Moal <dlemoal@kernel.org> --- drivers/block/zloop.c | 240 ++++++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 51c043342127..8ca37ca1935a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -378,125 +378,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret) zloop_put_cmd(cmd); } -static void zloop_rw(struct zloop_cmd *cmd) +static int zloop_do_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); + int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE; + unsigned int nr_bvec = blk_rq_nr_bvec(rq); struct zloop_device *zlo = rq->q->queuedata; - unsigned int zone_no = rq_zone_no(rq); - sector_t sector = blk_rq_pos(rq); - sector_t nr_sectors = blk_rq_sectors(rq); - bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; - bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; - int rw = is_write ? ITER_SOURCE : ITER_DEST; + struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; struct req_iterator rq_iter; - struct zloop_zone *zone; struct iov_iter iter; - struct bio_vec tmp; - unsigned long flags; - sector_t zone_end; - unsigned int nr_bvec; - int ret; - - atomic_set(&cmd->ref, 2); - cmd->sector = sector; - cmd->nr_sectors = nr_sectors; - cmd->ret = 0; - - if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { - ret = -EIO; - goto out; - } - - /* We should never get an I/O beyond the device capacity. 
*/ - if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { - ret = -EIO; - goto out; - } - zone = &zlo->zones[zone_no]; - zone_end = zone->start + zlo->zone_capacity; - - /* - * The block layer should never send requests that are not fully - * contained within the zone. - */ - if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { - ret = -EIO; - goto out; - } - - if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { - mutex_lock(&zone->lock); - ret = zloop_update_seq_zone(zlo, zone_no); - mutex_unlock(&zone->lock); - if (ret) - goto out; - } - - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { - mutex_lock(&zone->lock); - - spin_lock_irqsave(&zone->wp_lock, flags); - - /* - * Zone append operations always go at the current write - * pointer, but regular write operations must already be - * aligned to the write pointer when submitted. - */ - if (is_append) { - /* - * If ordered zone append is in use, we already checked - * and set the target sector in zloop_queue_rq(). - */ - if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL || - zone->wp + nr_sectors > zone_end) { - spin_unlock_irqrestore(&zone->wp_lock, - flags); - ret = -EIO; - goto unlock; - } - sector = zone->wp; - } - cmd->sector = sector; - } else if (sector != zone->wp) { - spin_unlock_irqrestore(&zone->wp_lock, flags); - pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", - zone_no, sector, zone->wp); - ret = -EIO; - goto unlock; - } - - /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Advance the write pointer, unless ordered zone append is in - * use. If the write fails, the write pointer position will be - * corrected when the next I/O starts execution. 
- */ - if (!is_append || !zlo->ordered_zone_append) { - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; - } - } - - spin_unlock_irqrestore(&zone->wp_lock, flags); - } - - nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { - struct bio_vec *bvec; + struct bio_vec tmp, *bvec; cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); - if (!cmd->bvec) { - ret = -EIO; - goto unlock; - } + if (!cmd->bvec) + return -EIO; /* * The bios of the request may be started from the middle of @@ -522,7 +419,7 @@ static void zloop_rw(struct zloop_cmd *cmd) iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } - cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) @@ -530,12 +427,123 @@ static void zloop_rw(struct zloop_cmd *cmd) cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); - else - ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); -unlock: - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + return zone->file->f_op->write_iter(&cmd->iocb, &iter); + return zone->file->f_op->read_iter(&cmd->iocb, &iter); +} + +static int zloop_seq_write_prep(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&zone->wp_lock, flags); + + /* + * Zone append operations always go at the current write pointer, but + * regular write operations must already be aligned to the write pointer + * 
when submitted. + */ + if (is_append) { + /* + * If ordered zone append is in use, we already checked and set + * the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + ret = -EIO; + goto out_unlock; + } + cmd->sector = zone->wp; + } + } else { + if (cmd->sector != zone->wp) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, cmd->sector, zone->wp); + ret = -EIO; + goto out_unlock; + } + } + + /* Implicitly open the target zone. */ + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Advance the write pointer, unless ordered zone append is in use. If + * the write fails, the write pointer position will be corrected when + * the next I/O starts execution. + */ + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + } +out_unlock: + spin_unlock_irqrestore(&zone->wp_lock, flags); + return ret; +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + struct zloop_zone *zone; + int ret = -EIO; + + atomic_set(&cmd->ref, 2); + cmd->sector = blk_rq_pos(rq); + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + if (WARN_ON_ONCE(is_append && !zlo->zone_append)) + goto out; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) + goto out; + + zone = &zlo->zones[zone_no]; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. 
+ */ + if (WARN_ON_ONCE(cmd->sector + nr_sectors > + zone->start + zlo->zone_size)) + goto out; + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + ret = zloop_seq_write_prep(cmd); + if (!ret) + ret = zloop_do_rw(cmd); + mutex_unlock(&zone->lock); + } else { + ret = zloop_do_rw(cmd); + } out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); -- 2.47.3 ^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 1/2] zloop: refactor zloop_rw 2026-03-19 6:02 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig @ 2026-03-19 14:06 ` Martin K. Petersen 0 siblings, 0 replies; 8+ messages in thread From: Martin K. Petersen @ 2026-03-19 14:06 UTC (permalink / raw) To: Christoph Hellwig; +Cc: Damien Le Moal, Jens Axboe, linux-block Christoph, > Split out two helpers functions to make the function more readable and > to avoid conditional locking. Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> -- Martin K. Petersen ^ permalink raw reply [flat|nested] 8+ messages in thread
* add a "discard cache" debug option to zloop v3 @ 2026-03-23 7:11 Christoph Hellwig 2026-03-23 7:11 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig 0 siblings, 1 reply; 8+ messages in thread From: Christoph Hellwig @ 2026-03-23 7:11 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe; +Cc: linux-block Hi all, this series adds a new option to zloop that, on device removal, discards data that was not committed to stable storage by a flush operation. The idea is to help test that file system code does the right thing in the face of volatile write caches. For conventional devices, this can be tested using dm-log-writes, but the concepts there don't work for sequential write required zones. Instead this adds an option to zloop, which records the write pointer at the last cache flush for each zone file in an xattr, and truncates the files down to that value on removal, simulating losing the contents of the volatile write cache. Changes since v2: - spelling fix Changes since v1: - fix up the documentation Diffstat: Documentation/admin-guide/blockdev/zoned_loop.rst | 5 drivers/block/zloop.c | 337 ++++++++++++++-------- 2 files changed, 226 insertions(+), 116 deletions(-) ^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/2] zloop: refactor zloop_rw 2026-03-23 7:11 add a "discard cache" debug option to zloop v3 Christoph Hellwig @ 2026-03-23 7:11 ` Christoph Hellwig 0 siblings, 0 replies; 8+ messages in thread From: Christoph Hellwig @ 2026-03-23 7:11 UTC (permalink / raw) To: Damien Le Moal, Jens Axboe Cc: linux-block, Bart Van Assche, Martin K. Petersen Split out two helper functions to make the function more readable and to avoid conditional locking. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Bart Van Assche <bvanassche@acm.org> Reviewed-by: Damien Le Moal <dlemoal@kernel.org> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> --- drivers/block/zloop.c | 240 ++++++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 51c043342127..8ca37ca1935a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -378,125 +378,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret) zloop_put_cmd(cmd); } -static void zloop_rw(struct zloop_cmd *cmd) +static int zloop_do_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); + int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE; + unsigned int nr_bvec = blk_rq_nr_bvec(rq); struct zloop_device *zlo = rq->q->queuedata; - unsigned int zone_no = rq_zone_no(rq); - sector_t sector = blk_rq_pos(rq); - sector_t nr_sectors = blk_rq_sectors(rq); - bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; - bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; - int rw = is_write ? 
ITER_SOURCE : ITER_DEST; + struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; struct req_iterator rq_iter; - struct zloop_zone *zone; struct iov_iter iter; - struct bio_vec tmp; - unsigned long flags; - sector_t zone_end; - unsigned int nr_bvec; - int ret; - - atomic_set(&cmd->ref, 2); - cmd->sector = sector; - cmd->nr_sectors = nr_sectors; - cmd->ret = 0; - - if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { - ret = -EIO; - goto out; - } - - /* We should never get an I/O beyond the device capacity. */ - if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { - ret = -EIO; - goto out; - } - zone = &zlo->zones[zone_no]; - zone_end = zone->start + zlo->zone_capacity; - - /* - * The block layer should never send requests that are not fully - * contained within the zone. - */ - if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { - ret = -EIO; - goto out; - } - - if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { - mutex_lock(&zone->lock); - ret = zloop_update_seq_zone(zlo, zone_no); - mutex_unlock(&zone->lock); - if (ret) - goto out; - } - - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { - mutex_lock(&zone->lock); - - spin_lock_irqsave(&zone->wp_lock, flags); - - /* - * Zone append operations always go at the current write - * pointer, but regular write operations must already be - * aligned to the write pointer when submitted. - */ - if (is_append) { - /* - * If ordered zone append is in use, we already checked - * and set the target sector in zloop_queue_rq(). 
- */ - if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL || - zone->wp + nr_sectors > zone_end) { - spin_unlock_irqrestore(&zone->wp_lock, - flags); - ret = -EIO; - goto unlock; - } - sector = zone->wp; - } - cmd->sector = sector; - } else if (sector != zone->wp) { - spin_unlock_irqrestore(&zone->wp_lock, flags); - pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", - zone_no, sector, zone->wp); - ret = -EIO; - goto unlock; - } - - /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Advance the write pointer, unless ordered zone append is in - * use. If the write fails, the write pointer position will be - * corrected when the next I/O starts execution. - */ - if (!is_append || !zlo->ordered_zone_append) { - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; - } - } - - spin_unlock_irqrestore(&zone->wp_lock, flags); - } - - nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { - struct bio_vec *bvec; + struct bio_vec tmp, *bvec; cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); - if (!cmd->bvec) { - ret = -EIO; - goto unlock; - } + if (!cmd->bvec) + return -EIO; /* * The bios of the request may be started from the middle of @@ -522,7 +419,7 @@ static void zloop_rw(struct zloop_cmd *cmd) iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } - cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) @@ -530,12 +427,123 @@ static void zloop_rw(struct zloop_cmd *cmd) cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); - else - ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); -unlock: - if 
(!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + return zone->file->f_op->write_iter(&cmd->iocb, &iter); + return zone->file->f_op->read_iter(&cmd->iocb, &iter); +} + +static int zloop_seq_write_prep(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&zone->wp_lock, flags); + + /* + * Zone append operations always go at the current write pointer, but + * regular write operations must already be aligned to the write pointer + * when submitted. + */ + if (is_append) { + /* + * If ordered zone append is in use, we already checked and set + * the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + ret = -EIO; + goto out_unlock; + } + cmd->sector = zone->wp; + } + } else { + if (cmd->sector != zone->wp) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, cmd->sector, zone->wp); + ret = -EIO; + goto out_unlock; + } + } + + /* Implicitly open the target zone. */ + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Advance the write pointer, unless ordered zone append is in use. If + * the write fails, the write pointer position will be corrected when + * the next I/O starts execution. 
+ */ + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + } +out_unlock: + spin_unlock_irqrestore(&zone->wp_lock, flags); + return ret; +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + struct zloop_zone *zone; + int ret = -EIO; + + atomic_set(&cmd->ref, 2); + cmd->sector = blk_rq_pos(rq); + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + if (WARN_ON_ONCE(is_append && !zlo->zone_append)) + goto out; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) + goto out; + + zone = &zlo->zones[zone_no]; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. + */ + if (WARN_ON_ONCE(cmd->sector + nr_sectors > + zone->start + zlo->zone_size)) + goto out; + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + ret = zloop_seq_write_prep(cmd); + if (!ret) + ret = zloop_do_rw(cmd); + mutex_unlock(&zone->lock); + } else { + ret = zloop_do_rw(cmd); + } out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); -- 2.47.3 ^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2026-03-23 7:12 UTC | newest] Thread overview: 8+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-03-18 5:53 add a "discard cache" debug option to zloop Christoph Hellwig 2026-03-18 5:53 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig 2026-03-18 6:58 ` Damien Le Moal 2026-03-18 5:53 ` [PATCH 2/2] zloop: forget write cache on force removal Christoph Hellwig 2026-03-18 7:03 ` Damien Le Moal -- strict thread matches above, loose matches on Subject: below -- 2026-03-19 6:02 add a "discard cache" debug option to zloop v2 Christoph Hellwig 2026-03-19 6:02 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig 2026-03-19 14:06 ` Martin K. Petersen 2026-03-23 7:11 add a "discard cache" debug option to zloop v3 Christoph Hellwig 2026-03-23 7:11 ` [PATCH 1/2] zloop: refactor zloop_rw Christoph Hellwig
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.