* bio completion in task enhancements / experiments
@ 2026-04-09 16:02 Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
` (7 more replies)
0 siblings, 8 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Hi all,
this series builds on top of:
Subject: [PATCH RFC v5 0/3] block: enable RWF_DONTCACHE for block devices
which I fixed up to apply to linux-next. If you want to seriously review
or test this, you're best off using the git branch here:
https://git.infradead.org/?p=users/hch/misc.git;a=shortlog;h=refs/heads/bio-task-completion
it first makes the complete in task interface more flexible so that it
can also be used from inside the ->bi_end_io handlers, which we'll need
for a few use cases. The second patch fixes the offload condition, the
next two then convert the uses in iomap added in the current merge window
over to the interface.
The last patch plays with the implementation and reuses concepts from
erofs to reduce the completion latency at the expense of more always
alive threads.
There's a few other places that could benefit from this, like erofs
decompression, PI verification in the block and file systems paths, or
fscrypt decryption.
Diffstat:
block/bio.c | 93 ++++++++++++++++++++++++++++++++++++++++++++
block/fops.c | 5 +-
fs/buffer.c | 25 ++++++++++-
fs/iomap/bio.c | 44 --------------------
fs/iomap/ioend.c | 53 +++----------------------
fs/xfs/xfs_aops.c | 4 -
include/linux/bio.h | 28 +++++++++++++
include/linux/blk_types.h | 6 ++
include/linux/buffer_head.h | 5 ++
9 files changed, 165 insertions(+), 98 deletions(-)
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback Christoph Hellwig
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Some bio completion handlers need to run in task context but bio_endio()
can be called from IRQ context (e.g. buffer_head writeback). Add a
BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
task-context completion of their bi_end_io callback.
When bio_endio() sees this flag and is running in non-task context, it
queues the bio to a per-cpu lockless list and schedules a delayed work
item to call bi_end_io() from task context. The delayed work uses a
1-jiffie delay to allow batches of completions to accumulate before
processing. A CPU hotplug dead callback drains any remaining bios from
the departing CPU's batch.
This will be used to enable RWF_DONTCACHE for block devices, and could
be used for other subsystems like fscrypt that need task-context bio
completion.
Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
block/bio.c | 83 ++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 7 +++-
2 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 641ef0928d73..550eb770bfa6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,6 +19,7 @@
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
+#include <linux/llist.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -1716,6 +1717,51 @@ void bio_check_pages_dirty(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
+struct bio_complete_batch {
+ struct llist_head list;
+ struct delayed_work work;
+ int cpu;
+};
+
+static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
+static struct workqueue_struct *bio_complete_wq;
+
+static void bio_complete_work_fn(struct work_struct *w)
+{
+ struct delayed_work *dw = to_delayed_work(w);
+ struct bio_complete_batch *batch =
+ container_of(dw, struct bio_complete_batch, work);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ do {
+ node = llist_del_all(&batch->list);
+ if (!node)
+ break;
+
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ if (need_resched()) {
+ if (!llist_empty(&batch->list))
+ mod_delayed_work_on(batch->cpu,
+ bio_complete_wq,
+ &batch->work, 0);
+ break;
+ }
+ } while (1);
+}
+
+static void bio_queue_completion(struct bio *bio)
+{
+ struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+
+ if (llist_add(&bio->bi_llist, &batch->list))
+ mod_delayed_work_on(batch->cpu, bio_complete_wq,
+ &batch->work, 1);
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1790,7 +1836,9 @@ void bio_endio(struct bio *bio)
}
#endif
- if (bio->bi_end_io)
+ if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
+ bio_queue_completion(bio);
+ else if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
@@ -1976,6 +2024,24 @@ int bioset_init(struct bio_set *bs,
}
EXPORT_SYMBOL(bioset_init);
+/*
+ * Drain a dead CPU's deferred bio completions.
+ */
+static int bio_complete_batch_cpu_dead(unsigned int cpu)
+{
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, cpu);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ node = llist_del_all(&batch->list);
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ return 0;
+}
+
static int __init init_bio(void)
{
int i;
@@ -1990,6 +2056,21 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
+ for_each_possible_cpu(i) {
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, i);
+
+ init_llist_head(&batch->list);
+ INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
+ batch->cpu = i;
+ }
+
+ bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
+ if (!bio_complete_wq)
+ panic("bio: can't allocate bio_complete workqueue\n");
+
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
+ NULL, bio_complete_batch_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..0b55159d110d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
#include <linux/device.h>
#include <linux/ktime.h>
#include <linux/rw_hint.h>
+#include <linux/llist.h>
struct bio_set;
struct bio;
@@ -208,7 +209,10 @@ typedef unsigned int blk_qc_t;
* stacking drivers)
*/
struct bio {
- struct bio *bi_next; /* request queue link */
+ union {
+ struct bio *bi_next; /* request queue link */
+ struct llist_node bi_llist; /* deferred completion */
+ };
struct block_device *bi_bdev;
blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits
* req_flags.
@@ -322,6 +326,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
+ BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 3/8] block: enable RWF_DONTCACHE for block devices Christoph Hellwig
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Set BIO_COMPLETE_IN_TASK on iomap writeback bios when a dropbehind folio
is added. This ensures that bi_end_io runs in task context, where
folio_end_dropbehind() can safely invalidate folios.
With the bio layer now handling task-context deferral generically,
IOMAP_IOEND_DONTCACHE is no longer needed, as XFS no longer needs to
route DONTCACHE ioends through its completion workqueue. Remove the flag
and its NOMERGE entry.
Without the NOMERGE, regular I/Os that get merged with a dropbehind
folio will also have their completion deferred to task context.
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
fs/iomap/ioend.c | 5 +++--
fs/xfs/xfs_aops.c | 4 ----
include/linux/iomap.h | 6 +-----
3 files changed, 4 insertions(+), 11 deletions(-)
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index acf3cf98b23a..892dbfc77ae9 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -237,8 +237,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
- if (folio_test_dropbehind(folio))
- ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
@@ -255,6 +253,9 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
+ if (folio_test_dropbehind(folio))
+ bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK);
+
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f279055fcea0..0dcf78beae8a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -511,10 +511,6 @@ xfs_ioend_needs_wq_completion(
if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
return true;
- /* Page cache invalidation cannot be done in irq context. */
- if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
- return true;
-
return false;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 2c5685adf3a9..bf49ba71dd42 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -399,16 +399,12 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
-/* is DONTCACHE I/O */
-#define IOMAP_IOEND_DONTCACHE (1U << 4)
-
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
- (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \
- IOMAP_IOEND_DONTCACHE)
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
/*
* Structure for writeback I/O completions.
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/8] block: enable RWF_DONTCACHE for block devices
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
2026-04-09 16:02 ` [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
From: Tal Zussman <tz2294@columbia.edu>
Block device buffered reads and writes already pass through
filemap_read() and iomap_file_buffered_write() respectively, both of
which handle IOCB_DONTCACHE. Enable RWF_DONTCACHE for block device files
by setting FOP_DONTCACHE in def_blk_fops.
For CONFIG_BUFFER_HEAD=y paths, add block_write_begin_iocb() which
threads the kiocb through so that buffer_head-based I/O can use
DONTCACHE behavior. The existing block_write_begin() is preserved as a
wrapper that passes a NULL iocb. Set BIO_COMPLETE_IN_TASK in
submit_bh_wbc() when the folio has dropbehind so that buffer_head
writeback completions get deferred to task context.
CONFIG_BUFFER_HEAD=n paths are handled by the previously added iomap
BIO_COMPLETE_IN_TASK support.
This support is useful for databases that operate on raw block devices,
among other userspace applications.
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
---
block/fops.c | 5 +++--
fs/buffer.c | 22 +++++++++++++++++++---
include/linux/buffer_head.h | 3 +++
3 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..31b073181d87 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -504,7 +504,8 @@ static int blkdev_write_begin(const struct kiocb *iocb,
unsigned len, struct folio **foliop,
void **fsdata)
{
- return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ blkdev_get_block);
}
static int blkdev_write_end(const struct kiocb *iocb,
@@ -966,7 +967,7 @@ const struct file_operations def_blk_fops = {
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
- .fop_flags = FOP_BUFFER_RASYNC,
+ .fop_flags = FOP_BUFFER_RASYNC | FOP_DONTCACHE,
};
static __init int blkdev_init(void)
diff --git a/fs/buffer.c b/fs/buffer.c
index d6e062c42a8d..289ab33fe3fd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2131,14 +2131,19 @@ EXPORT_SYMBOL(block_commit_write);
*
* The filesystem needs to handle block truncation upon failure.
*/
-int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block)
{
pgoff_t index = pos >> PAGE_SHIFT;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
struct folio *folio;
int status;
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
+ fgp_flags |= FGP_DONTCACHE;
+
+ folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -2153,6 +2158,13 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
*foliop = folio;
return status;
}
+
+int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block)
+{
+ return block_write_begin_iocb(NULL, mapping, pos, len, foliop,
+ get_block);
+}
EXPORT_SYMBOL(block_write_begin);
int block_write_end(loff_t pos, unsigned len, unsigned copied,
@@ -2481,7 +2493,8 @@ int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
(*bytes)++;
}
- return block_write_begin(mapping, pos, len, foliop, get_block);
+ return block_write_begin_iocb(iocb, mapping, pos, len, foliop,
+ get_block);
}
EXPORT_SYMBOL(cont_write_begin);
@@ -2711,6 +2724,9 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (IS_ENABLED(CONFIG_FS_ENCRYPTION))
buffer_set_crypto_ctx(bio, bh, GFP_NOIO);
+ if (folio_test_dropbehind(bh->b_folio))
+ bio_set_flag(bio, BIO_COMPLETE_IN_TASK);
+
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index e4939e33b4b5..4ce50882d621 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -260,6 +260,9 @@ int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
struct folio **foliop, get_block_t *get_block);
+int block_write_begin_iocb(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos, unsigned len,
+ struct folio **foliop, get_block_t *get_block);
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block);
int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (2 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 3/8] block: enable RWF_DONTCACHE for block devices Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 5/8] FOLD: don't use in_task() to decide for offloading Christoph Hellwig
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace the bio-flag based interface with an explicit
bio_complete_in_task() API. The advantage is that this can also be
called from inside the ->bi_end_io callback and thus dynamically.
This will be important to use it for fserror reporting.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/bio.c | 7 +++----
fs/buffer.c | 5 ++++-
fs/iomap/ioend.c | 11 ++++++++---
include/linux/bio.h | 17 +++++++++++++++++
include/linux/blk_types.h | 1 -
include/linux/buffer_head.h | 2 ++
include/linux/iomap.h | 6 +++++-
7 files changed, 39 insertions(+), 10 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 550eb770bfa6..88d191455762 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1753,7 +1753,7 @@ static void bio_complete_work_fn(struct work_struct *w)
} while (1);
}
-static void bio_queue_completion(struct bio *bio)
+void __bio_complete_in_task(struct bio *bio)
{
struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
@@ -1761,6 +1761,7 @@ static void bio_queue_completion(struct bio *bio)
mod_delayed_work_on(batch->cpu, bio_complete_wq,
&batch->work, 1);
}
+EXPORT_SYMBOL_GPL(__bio_complete_in_task);
static inline bool bio_remaining_done(struct bio *bio)
{
@@ -1836,9 +1837,7 @@ void bio_endio(struct bio *bio)
}
#endif
- if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
- bio_queue_completion(bio);
- else if (bio->bi_end_io)
+ if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 289ab33fe3fd..b5de776c8491 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2673,6 +2673,9 @@ static void end_bio_bh_io_sync(struct bio *bio)
{
struct buffer_head *bh = bio->bi_private;
+ if (buffer_dropbehind(bh) && bio_complete_in_task(bio))
+ return;
+
if (unlikely(bio_flagged(bio, BIO_QUIET)))
set_bit(BH_Quiet, &bh->b_state);
@@ -2725,7 +2728,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
buffer_set_crypto_ctx(bio, bh, GFP_NOIO);
if (folio_test_dropbehind(bh->b_folio))
- bio_set_flag(bio, BIO_COMPLETE_IN_TASK);
+ set_buffer_dropbehind(bh);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 892dbfc77ae9..a32ece8a3ee3 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -117,6 +117,12 @@ static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) {
+ if (bio_complete_in_task(bio))
+ return;
+ }
+
ioend->io_error = blk_status_to_errno(bio->bi_status);
if (ioend->io_error) {
iomap_fail_ioend_buffered(ioend);
@@ -237,6 +243,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
+ if (folio_test_dropbehind(folio))
+ ioend_flags |= IOMAP_IOEND_DONTCACHE;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
@@ -253,9 +261,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
- if (folio_test_dropbehind(folio))
- bio_set_flag(&ioend->io_bio, BIO_COMPLETE_IN_TASK);
-
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0b6744557b42..45c311e5ff71 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -375,6 +375,23 @@ static inline struct bio *bio_alloc(struct block_device *bdev,
void submit_bio(struct bio *bio);
+void __bio_complete_in_task(struct bio *bio);
+
+/**
+ * bio_complete_in_task - ensure a bio is complete in preemptible task context
+ * @bio: bio to complete
+ *
+ * If called from non-task context, offload the bio completion to worker thread
+ * and return %true. Else return %false and do nothing.
+ */
+static inline bool bio_complete_in_task(struct bio *bio)
+{
+ if (in_task())
+ return false;
+ __bio_complete_in_task(bio);
+ return true;
+}
+
extern void bio_endio(struct bio *);
static inline void bio_io_error(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0b55159d110d..8419f42de14f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -326,7 +326,6 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
- BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 4ce50882d621..bd7df5883cc8 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -35,6 +35,7 @@ enum bh_state_bits {
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_Migrate, /* Buffer is being migrated (norefs) */
+ BH_Dropbehind, /* drop pages on IO completion */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -136,6 +137,7 @@ BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)
+BUFFER_FNS(Dropbehind, dropbehind)
static __always_inline void set_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index bf49ba71dd42..2c5685adf3a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -399,12 +399,16 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
+/* is DONTCACHE I/O */
+#define IOMAP_IOEND_DONTCACHE (1U << 4)
+
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
- (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \
+ IOMAP_IOEND_DONTCACHE)
/*
* Structure for writeback I/O completions.
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 5/8] FOLD: don't use in_task() to decide for offloading
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (3 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors Christoph Hellwig
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
As described in commit c99fab6e80b76, some block drivers might call
into ->bi_end_io from non-preemptible context. Copy and paste the
logic from that commit, although having a core helper for it would
be nicer.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
include/linux/bio.h | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 45c311e5ff71..72664807c757 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -375,6 +375,16 @@ static inline struct bio *bio_alloc(struct block_device *bdev,
void submit_bio(struct bio *bio);
+/* Offload from atomic contexts to minimize scheduling overhead */
+static inline bool bio_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
void __bio_complete_in_task(struct bio *bio);
/**
@@ -386,10 +396,11 @@ void __bio_complete_in_task(struct bio *bio);
*/
static inline bool bio_complete_in_task(struct bio *bio)
{
- if (in_task())
- return false;
- __bio_complete_in_task(bio);
- return true;
+ if (bio_in_atomic()) {
+ __bio_complete_in_task(bio);
+ return true;
+ }
+ return false;
}
extern void bio_endio(struct bio *);
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (4 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 5/8] FOLD: don't use in_task() to decide for offloading Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions Christoph Hellwig
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace our own hand-crafted complete-in-task-context scheme with the
generic block code.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/bio.c | 44 +-------------------------------------------
1 file changed, 1 insertion(+), 43 deletions(-)
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
index 4504f4633f17..5b9b91198ec8 100644
--- a/fs/iomap/bio.c
+++ b/fs/iomap/bio.c
@@ -9,9 +9,6 @@
#include "internal.h"
#include "trace.h"
-static DEFINE_SPINLOCK(failed_read_lock);
-static struct bio_list failed_read_list = BIO_EMPTY_LIST;
-
static u32 __iomap_read_end_io(struct bio *bio, int error)
{
struct folio_iter fi;
@@ -27,49 +24,10 @@ static u32 __iomap_read_end_io(struct bio *bio, int error)
return folio_count;
}
-static void
-iomap_fail_reads(
- struct work_struct *work)
-{
- struct bio *bio;
- struct bio_list tmp = BIO_EMPTY_LIST;
- unsigned long flags;
-
- spin_lock_irqsave(&failed_read_lock, flags);
- bio_list_merge_init(&tmp, &failed_read_list);
- spin_unlock_irqrestore(&failed_read_lock, flags);
-
- while ((bio = bio_list_pop(&tmp)) != NULL) {
- __iomap_read_end_io(bio, blk_status_to_errno(bio->bi_status));
- cond_resched();
- }
-}
-
-static DECLARE_WORK(failed_read_work, iomap_fail_reads);
-
-static void iomap_fail_buffered_read(struct bio *bio)
-{
- unsigned long flags;
-
- /*
- * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
- * in the fserror code. The caller no longer owns the bio reference
- * after the spinlock drops.
- */
- spin_lock_irqsave(&failed_read_lock, flags);
- if (bio_list_empty(&failed_read_list))
- WARN_ON_ONCE(!schedule_work(&failed_read_work));
- bio_list_add(&failed_read_list, bio);
- spin_unlock_irqrestore(&failed_read_lock, flags);
-}
-
static void iomap_read_end_io(struct bio *bio)
{
- if (bio->bi_status) {
- iomap_fail_buffered_read(bio);
+ if (bio->bi_status && bio_complete_in_task(bio))
return;
- }
-
__iomap_read_end_io(bio, 0);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (5 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Replace our own hand-crafted complete-in-task-context scheme with the
generic block code.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/ioend.c | 53 +++++-------------------------------------------
1 file changed, 5 insertions(+), 48 deletions(-)
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index a32ece8a3ee3..160224007486 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -72,63 +72,20 @@ static u32 iomap_finish_ioend_buffered_write(struct iomap_ioend *ioend)
return folio_count;
}
-static DEFINE_SPINLOCK(failed_ioend_lock);
-static LIST_HEAD(failed_ioend_list);
-
-static void
-iomap_fail_ioends(
- struct work_struct *work)
-{
- struct iomap_ioend *ioend;
- struct list_head tmp;
- unsigned long flags;
-
- spin_lock_irqsave(&failed_ioend_lock, flags);
- list_replace_init(&failed_ioend_list, &tmp);
- spin_unlock_irqrestore(&failed_ioend_lock, flags);
-
- while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
- io_list))) {
- list_del_init(&ioend->io_list);
- iomap_finish_ioend_buffered_write(ioend);
- cond_resched();
- }
-}
-
-static DECLARE_WORK(failed_ioend_work, iomap_fail_ioends);
-
-static void iomap_fail_ioend_buffered(struct iomap_ioend *ioend)
-{
- unsigned long flags;
-
- /*
- * Bounce I/O errors to a workqueue to avoid nested i_lock acquisitions
- * in the fserror code. The caller no longer owns the ioend reference
- * after the spinlock drops.
- */
- spin_lock_irqsave(&failed_ioend_lock, flags);
- if (list_empty(&failed_ioend_list))
- WARN_ON_ONCE(!schedule_work(&failed_ioend_work));
- list_add_tail(&ioend->io_list, &failed_ioend_list);
- spin_unlock_irqrestore(&failed_ioend_lock, flags);
-}
-
static void ioend_writeback_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
- /* Page cache invalidation cannot be done in irq context. */
- if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) {
+ /*
+ * Page cache invalidation and error reporting cannot be done in irq
+ * context.
+ */
+ if ((ioend->io_flags & IOMAP_IOEND_DONTCACHE) || bio->bi_status) {
if (bio_complete_in_task(bio))
return;
}
ioend->io_error = blk_status_to_errno(bio->bi_status);
- if (ioend->io_error) {
- iomap_fail_ioend_buffered(ioend);
- return;
- }
-
iomap_finish_ioend_buffered_write(ioend);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
` (6 preceding siblings ...)
2026-04-09 16:02 ` [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions Christoph Hellwig
@ 2026-04-09 16:02 ` Christoph Hellwig
7 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2026-04-09 16:02 UTC (permalink / raw)
To: Tal Zussman, Jens Axboe, Matthew Wilcox (Oracle),
Christian Brauner, Darrick J. Wong, Carlos Maiolino, Al Viro,
Jan Kara
Cc: Dave Chinner, Bart Van Assche, Gao Xiang, linux-block,
linux-kernel, linux-xfs, linux-fsdevel, linux-mm
Commit 3fffb589b9a6 ("erofs: add per-cpu threads for decompression as an
option") explains why workqueues aren't great for low-latency completion
handling. Switch to a per-cpu kthread to handle it instead. This code
is based on the erofs code in the above commit, but further simplified
by directly using a kthread instead of a kthread_work.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/bio.c | 117 +++++++++++++++++++++++++++++-----------------------
1 file changed, 65 insertions(+), 52 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 88d191455762..6a993fb129a0 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -19,7 +19,7 @@
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
-#include <linux/llist.h>
+#include <linux/freezer.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -1718,51 +1718,83 @@ void bio_check_pages_dirty(struct bio *bio)
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
struct bio_complete_batch {
- struct llist_head list;
- struct delayed_work work;
- int cpu;
+ spinlock_t lock;
+ struct bio_list bios;
+ struct task_struct *worker;
};
static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
-static struct workqueue_struct *bio_complete_wq;
-static void bio_complete_work_fn(struct work_struct *w)
+static bool bio_try_complete_batch(struct bio_complete_batch *batch)
{
- struct delayed_work *dw = to_delayed_work(w);
- struct bio_complete_batch *batch =
- container_of(dw, struct bio_complete_batch, work);
- struct llist_node *node;
- struct bio *bio, *next;
+ struct bio_list bios;
+ unsigned long flags;
+ struct bio *bio;
- do {
- node = llist_del_all(&batch->list);
- if (!node)
- break;
+ spin_lock_irqsave(&batch->lock, flags);
+ bios = batch->bios;
+ bio_list_init(&batch->bios);
+ spin_unlock_irqrestore(&batch->lock, flags);
- node = llist_reverse_order(node);
- llist_for_each_entry_safe(bio, next, node, bi_llist)
- bio->bi_end_io(bio);
+ if (bio_list_empty(&bios))
+ return false;
- if (need_resched()) {
- if (!llist_empty(&batch->list))
- mod_delayed_work_on(batch->cpu,
- bio_complete_wq,
- &batch->work, 0);
- break;
- }
- } while (1);
+ __set_current_state(TASK_RUNNING);
+ while ((bio = bio_list_pop(&bios)))
+ bio->bi_end_io(bio);
+ return true;
+}
+
+static int bio_complete_thread(void *private)
+{
+ struct bio_complete_batch *batch = private;
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!bio_try_complete_batch(batch))
+ schedule();
+ }
+
+ return 0;
}
void __bio_complete_in_task(struct bio *bio)
{
- struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+ struct bio_complete_batch *batch;
+ unsigned long flags;
+ bool wake;
+
+ get_cpu();
+ batch = this_cpu_ptr(&bio_complete_batch);
+ spin_lock_irqsave(&batch->lock, flags);
+ wake = bio_list_empty(&batch->bios);
+ bio_list_add(&batch->bios, bio);
+ spin_unlock_irqrestore(&batch->lock, flags);
+ put_cpu();
- if (llist_add(&bio->bi_llist, &batch->list))
- mod_delayed_work_on(batch->cpu, bio_complete_wq,
- &batch->work, 1);
+ if (wake)
+ wake_up_process(batch->worker);
}
EXPORT_SYMBOL_GPL(__bio_complete_in_task);
+static void __init bio_complete_batch_init(int cpu)
+{
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, cpu);
+ struct task_struct *worker;
+
+ worker = kthread_create_on_cpu(bio_complete_thread,
+ per_cpu_ptr(&bio_complete_batch, cpu),
+ cpu, "bio_worker/%u");
+ if (IS_ERR(worker))
+ panic("bio: can't create kthread_work");
+ sched_set_fifo_low(worker);
+
+ spin_lock_init(&batch->lock);
+ bio_list_init(&batch->bios);
+ batch->worker = worker;
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -2028,16 +2060,7 @@ EXPORT_SYMBOL(bioset_init);
*/
static int bio_complete_batch_cpu_dead(unsigned int cpu)
{
- struct bio_complete_batch *batch =
- per_cpu_ptr(&bio_complete_batch, cpu);
- struct llist_node *node;
- struct bio *bio, *next;
-
- node = llist_del_all(&batch->list);
- node = llist_reverse_order(node);
- llist_for_each_entry_safe(bio, next, node, bi_llist)
- bio->bi_end_io(bio);
-
+ bio_try_complete_batch(per_cpu_ptr(&bio_complete_batch, cpu));
return 0;
}
@@ -2055,18 +2078,8 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
- for_each_possible_cpu(i) {
- struct bio_complete_batch *batch =
- per_cpu_ptr(&bio_complete_batch, i);
-
- init_llist_head(&batch->list);
- INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
- batch->cpu = i;
- }
-
- bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
- if (!bio_complete_wq)
- panic("bio: can't allocate bio_complete workqueue\n");
+ for_each_possible_cpu(i)
+ bio_complete_batch_init(i);
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
NULL, bio_complete_batch_cpu_dead);
--
2.47.3
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-04-09 16:04 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-09 16:02 bio completion in task enhancements / experiments Christoph Hellwig
2026-04-09 16:02 ` [PATCH 1/8] block: add BIO_COMPLETE_IN_TASK for task-context completion Christoph Hellwig
2026-04-09 16:02 ` [PATCH 2/8] iomap: use BIO_COMPLETE_IN_TASK for dropbehind writeback Christoph Hellwig
2026-04-09 16:02 ` [PATCH 3/8] block: enable RWF_DONTCACHE for block devices Christoph Hellwig
2026-04-09 16:02 ` [PATCH 4/8] FOLD: block: change the defer in task context interface to be procedural Christoph Hellwig
2026-04-09 16:02 ` [PATCH 5/8] FOLD: don't use in_task() to decide for offloading Christoph Hellwig
2026-04-09 16:02 ` [PATCH 6/8] iomap: use bio_complete_in_task for buffered read errors Christoph Hellwig
2026-04-09 16:02 ` [PATCH 7/8] iomap: use bio_complete_in_task for buffered write completions Christoph Hellwig
2026-04-09 16:02 ` [PATCH 8/8] RFC: use a TASK_FIFO kthread for read completion support Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox