[PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct

public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  2026-01-14  7:40 bounce buffer direct I/O when stable pages are required Christoph Hellwig
@ 2026-01-14  7:41 ` Christoph Hellwig
  2026-01-14 22:54   ` Darrick J. Wong
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-14  7:41 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Refactor the two per-bio completion handlers to share common code using
a new helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 63374ba83b55..bf59241a090b 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -210,16 +210,20 @@ static void iomap_dio_done(struct iomap_dio *dio)
 	iomap_dio_complete_work(&dio->aio.work);
 }
 
-void iomap_dio_bio_end_io(struct bio *bio)
+static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
 	struct iomap_dio *dio = bio->bi_private;
 	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 
-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-	if (atomic_dec_and_test(&dio->ref))
+	if (atomic_dec_and_test(&dio->ref)) {
+		/*
+		 * Avoid another context switch for the completion when already
+		 * called from the ioend completion workqueue.
+		 */
+		if (inline_completion)
+			dio->flags &= ~IOMAP_DIO_COMP_WORK;
 		iomap_dio_done(dio);
+	}
 
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
@@ -228,33 +232,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
 		bio_put(bio);
 	}
 }
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+	__iomap_dio_bio_end_io(bio, false);
+}
 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
 
 u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 {
 	struct iomap_dio *dio = ioend->io_bio.bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 	u32 vec_count = ioend->io_bio.bi_vcnt;
 
 	if (ioend->io_error)
 		iomap_dio_set_error(dio, ioend->io_error);
-
-	if (atomic_dec_and_test(&dio->ref)) {
-		/*
-		 * Try to avoid another context switch for the completion given
-		 * that we are already called from the ioend completion
-		 * workqueue.
-		 */
-		dio->flags &= ~IOMAP_DIO_COMP_WORK;
-		iomap_dio_done(dio);
-	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(&ioend->io_bio);
-	} else {
-		bio_release_pages(&ioend->io_bio, false);
-		bio_put(&ioend->io_bio);
-	}
+	__iomap_dio_bio_end_io(&ioend->io_bio, true);
 
 	/*
 	 * Return the number of bvecs completed as even direct I/O completions
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* Re: [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  2026-01-14  7:41 ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
@ 2026-01-14 22:54   ` Darrick J. Wong
  0 siblings, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-14 22:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Wed, Jan 14, 2026 at 08:41:07AM +0100, Christoph Hellwig wrote:
> Refactor the two per-bio completion handlers to share common code using
> a new helper.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/iomap/direct-io.c | 42 +++++++++++++++++++-----------------------
>  1 file changed, 19 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 63374ba83b55..bf59241a090b 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -210,16 +210,20 @@ static void iomap_dio_done(struct iomap_dio *dio)
>  	iomap_dio_complete_work(&dio->aio.work);
>  }
>  
> -void iomap_dio_bio_end_io(struct bio *bio)
> +static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
>  {
>  	struct iomap_dio *dio = bio->bi_private;
>  	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
>  
> -	if (bio->bi_status)
> -		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
> -
> -	if (atomic_dec_and_test(&dio->ref))
> +	if (atomic_dec_and_test(&dio->ref)) {
> +		/*
> +		 * Avoid another context switch for the completion when already
> +		 * called from the ioend completion workqueue.
> +		 */
> +		if (inline_completion)
> +			dio->flags &= ~IOMAP_DIO_COMP_WORK;
>  		iomap_dio_done(dio);
> +	}
>  
>  	if (should_dirty) {
>  		bio_check_pages_dirty(bio);
> @@ -228,33 +232,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
>  		bio_put(bio);
>  	}
>  }
> +
> +void iomap_dio_bio_end_io(struct bio *bio)
> +{
> +	struct iomap_dio *dio = bio->bi_private;
> +
> +	if (bio->bi_status)
> +		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
> +	__iomap_dio_bio_end_io(bio, false);
> +}
>  EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
>  
>  u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
>  {
>  	struct iomap_dio *dio = ioend->io_bio.bi_private;
> -	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
>  	u32 vec_count = ioend->io_bio.bi_vcnt;
>  
>  	if (ioend->io_error)
>  		iomap_dio_set_error(dio, ioend->io_error);
> -
> -	if (atomic_dec_and_test(&dio->ref)) {
> -		/*
> -		 * Try to avoid another context switch for the completion given
> -		 * that we are already called from the ioend completion
> -		 * workqueue.
> -		 */
> -		dio->flags &= ~IOMAP_DIO_COMP_WORK;
> -		iomap_dio_done(dio);
> -	}
> -
> -	if (should_dirty) {
> -		bio_check_pages_dirty(&ioend->io_bio);
> -	} else {
> -		bio_release_pages(&ioend->io_bio, false);
> -		bio_put(&ioend->io_bio);
> -	}
> +	__iomap_dio_bio_end_io(&ioend->io_bio, true);
>  
>  	/*
>  	 * Return the number of bvecs completed as even direct I/O completions
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* bounce buffer direct I/O when stable pages are required v2
@ 2026-01-19  7:44 ` Christoph Hellwig
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
                     ` (15 more replies)
  0 siblings, 16 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Hi all,

this series tries to address the problem that pages under I/O can be
modified during direct I/O, even when the device or file system requires
stable pages during I/O to calculate checksums, parity or data
operations.  It does so by adding block layer helpers to bounce buffer
an iov_iter into a bio, then wires that up in iomap and ultimately
XFS.

The reason that the file system even needs to know about it, is because
reads need a user context to copy the data back, and the infrastructure
to defer ioends to a workqueue currently sits in XFS.  I'm going to look
into moving that into ioend and enabling it for other file systems.
Additionally btrfs already has its own infrastructure for this, and
actually an urgent need to bounce buffer, so this should be useful there
and could be wired up easily.  In fact the idea comes from patches by
Qu that did this in btrfs.

This patch fixes all but one of the xfstests failures on T10 PI capable
devices (generic/095 seems to have issues with a mix of mmap and splice
still, I'm looking into that separately), and makes qemu VMs running
Windows, or Linux with swap enabled, work fine on an XFS file on a device
using PI.

Performance numbers on my (not exactly state of the art) NVMe PI test
setup:

  Sequential reads using io_uring, QD=16.
  Bandwidth and CPU usage (usr/sys):

  | size |        zero copy         |          bounce          |
  +------+--------------------------+--------------------------+
  |   4k | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
  |  64K | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
  |   1M | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
  +------+--------------------------+--------------------------+

  Sequential writes using io_uring, QD=16.
  Bandwidth and CPU usage (usr/sys):

  | size |        zero copy         |          bounce          |
  +------+--------------------------+--------------------------+
  |   4k |  882MiB/s (11.83/33.88%) |  750MiB/s (10.53/34.08%) |
  |  64K | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
  |   1M | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
  +------+--------------------------+--------------------------+

Note that the 64k read numbers look really odd to me for the baseline
zero copy case, but are reproducible over many repeated runs.

The bounce read numbers should further improve when moving the PI
validation to the file system and removing the double context switch,
which I have patches for that will be sent out soon.

Changes since v1:
 - spelling fixes
 - add more details to some commit messages
 - add a new code comment about freeing the bio early in the I/O
   completion handler

Diffstat:
 block/bio.c           |  323 ++++++++++++++++++++++++++++++--------------------
 block/blk.h           |   11 -
 fs/iomap/direct-io.c  |  191 ++++++++++++++++-------------
 fs/iomap/ioend.c      |    8 +
 fs/xfs/xfs_aops.c     |    8 -
 fs/xfs/xfs_file.c     |   41 +++++-
 include/linux/bio.h   |   26 ++++
 include/linux/iomap.h |    9 +
 include/linux/uio.h   |    3 
 lib/iov_iter.c        |   98 +++++++++++++++
 10 files changed, 492 insertions(+), 226 deletions(-)

^ permalink raw reply	[flat|nested] 75+ messages in thread

* [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-22 11:00     ` Johannes Thumshirn
                       ` (4 more replies)
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
                     ` (14 subsequent siblings)
  15 siblings, 5 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Move all of the logic to find the contiguous length inside a folio into
get_contig_folio_len instead of keeping some of it in the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c | 62 +++++++++++++++++++++++------------------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 2359c0723b88..18dfdaba0c73 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1172,33 +1172,35 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static unsigned int get_contig_folio_len(unsigned int *num_pages,
-					 struct page **pages, unsigned int i,
-					 struct folio *folio, size_t left,
+static unsigned int get_contig_folio_len(struct page **pages,
+					 unsigned int *num_pages, size_t left,
 					 size_t offset)
 {
-	size_t bytes = left;
-	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
-	unsigned int j;
+	struct folio *folio = page_folio(pages[0]);
+	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
+	unsigned int max_pages, i;
+	size_t folio_offset, len;
+
+	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
+	len = min(folio_size(folio) - folio_offset, left);
 
 	/*
-	 * We might COW a single page in the middle of
-	 * a large folio, so we have to check that all
-	 * pages belong to the same folio.
+	 * We might COW a single page in the middle of a large folio, so we have
+	 * to check that all pages belong to the same folio.
 	 */
-	bytes -= contig_sz;
-	for (j = i + 1; j < i + *num_pages; j++) {
-		size_t next = min_t(size_t, PAGE_SIZE, bytes);
+	left -= contig_sz;
+	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+	for (i = 1; i < max_pages; i++) {
+		size_t next = min_t(size_t, PAGE_SIZE, left);
 
-		if (page_folio(pages[j]) != folio ||
-		    pages[j] != pages[j - 1] + 1) {
+		if (page_folio(pages[i]) != folio ||
+		    pages[i] != pages[i - 1] + 1)
 			break;
-		}
 		contig_sz += next;
-		bytes -= next;
+		left -= next;
 	}
-	*num_pages = j - i;
 
+	*num_pages = i;
 	return contig_sz;
 }
 
@@ -1222,8 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
 	ssize_t size;
-	unsigned int num_pages, i = 0;
-	size_t offset, folio_offset, left, len;
+	unsigned int i = 0;
+	size_t offset, left, len;
 	int ret = 0;
 
 	/*
@@ -1244,23 +1246,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		return size ? size : -EFAULT;
 
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
-		struct page *page = pages[i];
-		struct folio *folio = page_folio(page);
+	for (left = size; left > 0; left -= len) {
 		unsigned int old_vcnt = bio->bi_vcnt;
+		unsigned int nr_to_add;
 
-		folio_offset = ((size_t)folio_page_idx(folio, page) <<
-			       PAGE_SHIFT) + offset;
-
-		len = min(folio_size(folio) - folio_offset, left);
-
-		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-
-		if (num_pages > 1)
-			len = get_contig_folio_len(&num_pages, pages, i,
-						   folio, left, offset);
-
-		if (!bio_add_folio(bio, folio, len, folio_offset)) {
+		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
+		if (!bio_add_page(bio, pages[i], len, offset)) {
 			WARN_ON_ONCE(1);
 			ret = -EINVAL;
 			goto out;
@@ -1275,8 +1266,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 			 * single pin per page.
 			 */
 			if (offset && bio->bi_vcnt == old_vcnt)
-				unpin_user_folio(folio, 1);
+				unpin_user_folio(page_folio(pages[i]), 1);
 		}
+		i += nr_to_add;
 		offset = 0;
 	}
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-22 11:04     ` Johannes Thumshirn
                       ` (3 more replies)
  2026-01-19  7:44   ` [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Christoph Hellwig
                     ` (13 subsequent siblings)
  15 siblings, 4 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

bio_add_page fails to add data to the bio when mixing P2P with non-P2P
ranges, or ranges that map to different P2P providers.  In that case
it will trigger that WARN_ON and return an error up the chain instead of
simply starting a new bio as intended.  Fix this by open coding
bio_add_page and handling this case explicitly.  While doing so, stop
merging physically contiguous data that belongs to multiple folios.  While
this merge could lead to more efficient bio packing in some cases,
dropping it will allow removing the handling of this corner case in other
places and make the code more robust.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 18dfdaba0c73..46ff33f4de04 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1216,7 +1216,7 @@ static unsigned int get_contig_folio_len(struct page **pages,
  * For a multi-segment *iter, this function only adds pages from the next
  * non-empty segment of the iov iterator.
  */
-static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
+static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	iov_iter_extraction_t extraction_flags = 0;
 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
@@ -1226,7 +1226,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	ssize_t size;
 	unsigned int i = 0;
 	size_t offset, left, len;
-	int ret = 0;
 
 	/*
 	 * Move page array up in the allocated memory for the bio vecs as far as
@@ -1247,37 +1246,26 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 	for (left = size; left > 0; left -= len) {
-		unsigned int old_vcnt = bio->bi_vcnt;
 		unsigned int nr_to_add;
 
-		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
-		if (!bio_add_page(bio, pages[i], len, offset)) {
-			WARN_ON_ONCE(1);
-			ret = -EINVAL;
-			goto out;
-		}
+		if (bio->bi_vcnt > 0) {
+			struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
-		if (bio_flagged(bio, BIO_PAGE_PINNED)) {
-			/*
-			 * We're adding another fragment of a page that already
-			 * was part of the last segment.  Undo our pin as the
-			 * page was pinned when an earlier fragment of it was
-			 * added to the bio and __bio_release_pages expects a
-			 * single pin per page.
-			 */
-			if (offset && bio->bi_vcnt == old_vcnt)
-				unpin_user_folio(page_folio(pages[i]), 1);
+			if (!zone_device_pages_have_same_pgmap(prev->bv_page,
+					pages[i]))
+				break;
 		}
+
+		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
+		__bio_add_page(bio, pages[i], len, offset);
 		i += nr_to_add;
 		offset = 0;
 	}
 
 	iov_iter_revert(iter, left);
-out:
 	while (i < nr_pages)
 		bio_release_page(bio, pages[i++]);
-
-	return ret;
+	return size - left;
 }
 
 /*
@@ -1337,7 +1325,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 			   unsigned len_align_mask)
 {
-	int ret = 0;
+	ssize_t ret;
 
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return -EIO;
@@ -1350,9 +1338,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 
 	if (iov_iter_extract_will_pin(iter))
 		bio_set_flag(bio, BIO_PAGE_PINNED);
+
 	do {
 		ret = __bio_iov_iter_get_pages(bio, iter);
-	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
+	} while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	if (bio->bi_vcnt)
 		return bio_iov_iter_align_down(bio, iter, len_align_mask);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-22 17:47     ` Darrick J. Wong
  2026-01-23 11:37     ` David Howells
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
                     ` (12 subsequent siblings)
  15 siblings, 2 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Massage __bio_iov_iter_get_pages so that it doesn't need the bio, and
move it to lib/iov_iter.c so that it can be used by block code for
other things than filling a bio and by other subsystems like netfs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c         | 120 +++++++-------------------------------------
 include/linux/uio.h |   3 ++
 lib/iov_iter.c      |  98 ++++++++++++++++++++++++++++++++++++
 3 files changed, 119 insertions(+), 102 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 46ff33f4de04..12cd3c5f6d6d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1172,102 +1172,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static unsigned int get_contig_folio_len(struct page **pages,
-					 unsigned int *num_pages, size_t left,
-					 size_t offset)
-{
-	struct folio *folio = page_folio(pages[0]);
-	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
-	unsigned int max_pages, i;
-	size_t folio_offset, len;
-
-	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
-	len = min(folio_size(folio) - folio_offset, left);
-
-	/*
-	 * We might COW a single page in the middle of a large folio, so we have
-	 * to check that all pages belong to the same folio.
-	 */
-	left -= contig_sz;
-	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-	for (i = 1; i < max_pages; i++) {
-		size_t next = min_t(size_t, PAGE_SIZE, left);
-
-		if (page_folio(pages[i]) != folio ||
-		    pages[i] != pages[i - 1] + 1)
-			break;
-		contig_sz += next;
-		left -= next;
-	}
-
-	*num_pages = i;
-	return contig_sz;
-}
-
-#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
-
-/**
- * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
- * @bio: bio to add pages to
- * @iter: iov iterator describing the region to be mapped
- *
- * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
- * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
- * For a multi-segment *iter, this function only adds pages from the next
- * non-empty segment of the iov iterator.
- */
-static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	iov_iter_extraction_t extraction_flags = 0;
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size;
-	unsigned int i = 0;
-	size_t offset, left, len;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
-		extraction_flags |= ITER_ALLOW_P2PDMA;
-
-	size = iov_iter_extract_pages(iter, &pages,
-				      UINT_MAX - bio->bi_iter.bi_size,
-				      nr_pages, extraction_flags, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
-	for (left = size; left > 0; left -= len) {
-		unsigned int nr_to_add;
-
-		if (bio->bi_vcnt > 0) {
-			struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-			if (!zone_device_pages_have_same_pgmap(prev->bv_page,
-					pages[i]))
-				break;
-		}
-
-		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
-		__bio_add_page(bio, pages[i], len, offset);
-		i += nr_to_add;
-		offset = 0;
-	}
-
-	iov_iter_revert(iter, left);
-	while (i < nr_pages)
-		bio_release_page(bio, pages[i++]);
-	return size - left;
-}
-
 /*
  * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
  * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
@@ -1325,7 +1229,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 			   unsigned len_align_mask)
 {
-	ssize_t ret;
+	iov_iter_extraction_t flags = 0;
 
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return -EIO;
@@ -1338,14 +1242,26 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 
 	if (iov_iter_extract_will_pin(iter))
 		bio_set_flag(bio, BIO_PAGE_PINNED);
+	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+		flags |= ITER_ALLOW_P2PDMA;
 
 	do {
-		ret = __bio_iov_iter_get_pages(bio, iter);
-	} while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0));
+		ssize_t ret;
+
+		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
+				UINT_MAX - bio->bi_iter.bi_size, &bio->bi_vcnt,
+				bio->bi_max_vecs, flags);
+		if (ret <= 0) {
+			if (!bio->bi_vcnt)
+				return ret;
+			break;
+		}
+		bio->bi_iter.bi_size += ret;
+	} while (iov_iter_count(iter) && !bio_full(bio, 0));
 
-	if (bio->bi_vcnt)
-		return bio_iov_iter_align_down(bio, iter, len_align_mask);
-	return ret;
+	if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
+		bio->bi_opf |= REQ_NOMERGE;
+	return bio_iov_iter_align_down(bio, iter, len_align_mask);
 }
 
 static void submit_bio_wait_endio(struct bio *bio)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 5b127043a151..a9bc5b3067e3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -389,6 +389,9 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
 			       size_t maxsize, unsigned int maxpages,
 			       iov_iter_extraction_t extraction_flags,
 			       size_t *offset0);
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+		size_t max_size, unsigned short *nr_vecs,
+		unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
 
 /**
  * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 896760bad455..545250507f08 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
 	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
+
+static unsigned int get_contig_folio_len(struct page **pages,
+		unsigned int *num_pages, size_t left, size_t offset)
+{
+	struct folio *folio = page_folio(pages[0]);
+	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
+	unsigned int max_pages, i;
+	size_t folio_offset, len;
+
+	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
+	len = min(folio_size(folio) - folio_offset, left);
+
+	/*
+	 * We might COW a single page in the middle of a large folio, so we have
+	 * to check that all pages belong to the same folio.
+	 */
+	left -= contig_sz;
+	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+	for (i = 1; i < max_pages; i++) {
+		size_t next = min_t(size_t, PAGE_SIZE, left);
+
+		if (page_folio(pages[i]) != folio ||
+		    pages[i] != pages[i - 1] + 1)
+			break;
+		contig_sz += next;
+		left -= next;
+	}
+
+	*num_pages = i;
+	return contig_sz;
+}
+
+#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
+
+/**
+ * iov_iter_extract_bvecs - Extract bvecs from an iterator
+ * @iter:	the iterator to extract from
+ * @bv:		bvec return array
+ * @max_size:	maximum size to extract from @iter
+ * @nr_vecs:	number of vectors in @bv (on in and output)
+ * @max_vecs:	maximum vectors in @bv, including those filled before calling
+ * @extraction_flags: flags to qualify request
+ *
+ * Like iov_iter_extract_pages(), but returns physically contiguous ranges
+ * contained in a single folio as a single bvec instead of multiple entries.
+ *
+ * Returns the number of bytes extracted when successful, or a negative errno.
+ * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes
+ * can be 0.
+ */
+ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
+		size_t max_size, unsigned short *nr_vecs,
+		unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
+{
+	unsigned short entries_left = max_vecs - *nr_vecs;
+	unsigned short nr_pages, i = 0;
+	size_t left, offset, len;
+	struct page **pages;
+	ssize_t size;
+
+	/*
+	 * Move page array up in the allocated memory for the bio vecs as far as
+	 * possible so that we can start filling biovecs from the beginning
+	 * without overwriting the temporary page array.
+	 */
+	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+	pages = (struct page **)(bv + *nr_vecs) +
+		entries_left * (PAGE_PTRS_PER_BVEC - 1);
+
+	size = iov_iter_extract_pages(iter, &pages, max_size, entries_left,
+			extraction_flags, &offset);
+	if (unlikely(size <= 0))
+		return size ? size : -EFAULT;
+
+	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
+	for (left = size; left > 0; left -= len) {
+		unsigned int nr_to_add;
+
+		if (*nr_vecs > 0 &&
+		    !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page,
+				pages[i]))
+			break;
+
+		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
+		bvec_set_page(&bv[*nr_vecs], pages[i], len, offset);
+		i += nr_to_add;
+		(*nr_vecs)++;
+		offset = 0;
+	}
+
+	iov_iter_revert(iter, left);
+	if (iov_iter_extract_will_pin(iter)) {
+		while (i < nr_pages)
+			unpin_user_page(pages[i++]);
+	}
+	return size - left;
+}
+EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 04/14] block: remove bio_release_page
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (2 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-22 11:14     ` Johannes Thumshirn
                       ` (3 more replies)
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
                     ` (11 subsequent siblings)
  15 siblings, 4 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Merge bio_release_page into the only remaining caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c |  4 +++-
 block/blk.h | 11 -----------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 12cd3c5f6d6d..c51b4e2470e2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1195,7 +1195,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
 			break;
 		}
 
-		bio_release_page(bio, bv->bv_page);
+		if (bio_flagged(bio, BIO_PAGE_PINNED))
+			unpin_user_page(bv->bv_page);
+
 		bio->bi_vcnt--;
 		nbytes -= bv->bv_len;
 	} while (nbytes);
diff --git a/block/blk.h b/block/blk.h
index 980eef1f5690..886238cae5f1 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -595,17 +595,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
 
 struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 		struct lock_class_key *lkclass);
-
-/*
- * Clean up a page appropriately, where the page may be pinned, may have a
- * ref taken on it or neither.
- */
-static inline void bio_release_page(struct bio *bio, struct page *page)
-{
-	if (bio_flagged(bio, BIO_PAGE_PINNED))
-		unpin_user_page(page);
-}
-
 struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
 
 int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (3 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-22 13:05     ` Johannes Thumshirn
                       ` (3 more replies)
  2026-01-19  7:44   ` [PATCH 06/14] iomap: fix submission side handling of completion side errors Christoph Hellwig
                     ` (10 subsequent siblings)
  15 siblings, 4 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Add helpers to bounce buffer data into a bio, to implement direct I/O
for cases where direct user access is not possible because stable
in-flight data is required.  These are intended to be used as easily as
bio_iov_iter_get_pages for the zero-copy path.

The write side is trivial and just copies data into the bounce buffer.
The read side is a lot more complex because it needs to perform the copy
from the completion context, and without preserving the iov_iter through
the call chain.  It steals a trick from the integrity data user interface
and uses the first vector in the bio for the bounce buffer data that is
fed to the block I/O stack, and uses the others to record the user
buffer fragments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c         | 178 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/bio.h |  26 +++++++
 2 files changed, 204 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index c51b4e2470e2..da795b1df52a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1266,6 +1266,184 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
 	return bio_iov_iter_align_down(bio, iter, len_align_mask);
 }
 
+static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
+{
+	struct folio *folio;
+
+	while (*size > PAGE_SIZE) {
+		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
+		if (folio)
+			return folio;
+		*size = rounddown_pow_of_two(*size - 1);
+	}
+
+	return folio_alloc(gfp, get_order(*size));
+}
+
+static void bio_free_folios(struct bio *bio)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_bvec_all(bv, bio, i) {
+		struct folio *folio = page_folio(bv->bv_page);
+
+		if (!is_zero_folio(folio))
+			folio_put(page_folio(bv->bv_page));
+	}
+}
+
+static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
+{
+	size_t total_len = iov_iter_count(iter);
+
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_iter.bi_size))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
+		return -EINVAL;
+
+	do {
+		size_t this_len = min(total_len, SZ_1M);
+		struct folio *folio;
+
+		if (this_len > PAGE_SIZE * 2)
+			this_len = rounddown_pow_of_two(this_len);
+
+		if (bio->bi_iter.bi_size > UINT_MAX - this_len)
+			break;
+
+		folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
+		if (!folio)
+			break;
+		bio_add_folio_nofail(bio, folio, this_len, 0);
+
+		if (copy_from_iter(folio_address(folio), this_len, iter) !=
+				this_len) {
+			bio_free_folios(bio);
+			return -EFAULT;
+		}
+
+		total_len -= this_len;
+	} while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
+
+	if (!bio->bi_iter.bi_size)
+		return -ENOMEM;
+	return 0;
+}
+
+static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
+{
+	size_t len = min(iov_iter_count(iter), SZ_1M);
+	struct folio *folio;
+
+	folio = folio_alloc_greedy(GFP_KERNEL, &len);
+	if (!folio)
+		return -ENOMEM;
+
+	do {
+		ssize_t ret;
+
+		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
+				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+		if (ret <= 0) {
+			if (!bio->bi_vcnt)
+				return ret;
+			break;
+		}
+		len -= ret;
+		bio->bi_iter.bi_size += ret;
+	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
+
+	/*
+	 * Set the folio directly here.  The above loop has already calculated
+	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
+	 * is safe as bi_vcnt is only for use by the submitter and not looked
+	 * at by the actual I/O path.
+	 */
+	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
+	if (iov_iter_extract_will_pin(iter))
+		bio_set_flag(bio, BIO_PAGE_PINNED);
+	return 0;
+}
+
+/**
+ * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
+ * @bio:	bio to send
+ * @iter:	iter to read from / write into
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Allocates folios to back the bounce buffer, and for writes
+ * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
+ * called on completion.
+ */
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
+{
+	if (op_is_write(bio_op(bio)))
+		return bio_iov_iter_bounce_write(bio, iter);
+	return bio_iov_iter_bounce_read(bio, iter);
+}
+
+static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
+{
+	struct folio *folio = page_folio(bv->bv_page);
+	size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
+			bv->bv_offset / PAGE_SIZE + 1;
+
+	if (mark_dirty)
+		folio_mark_dirty_lock(folio);
+	unpin_user_folio(folio, nr_pages);
+}
+
+static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
+		bool mark_dirty)
+{
+	unsigned int len = bio->bi_io_vec[0].bv_len;
+
+	if (likely(!is_error)) {
+		void *buf = bvec_virt(&bio->bi_io_vec[0]);
+		struct iov_iter to;
+
+		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
+				len);
+		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
+	} else {
+		/* No need to mark folios dirty if never copied to them */
+		mark_dirty = false;
+	}
+
+	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
+	}
+
+	folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+}
+
+/**
+ * bio_iov_iter_unbounce - finish a bounce buffer operation
+ * @bio:	completed bio
+ * @is_error:	%true if an I/O error occurred and data should not be copied
+ * @mark_dirty:	If %true, folios will be marked dirty.
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Called to complete a bio set up by bio_iov_iter_bounce().
+ * Copies data back for reads, and marks the original folios dirty if
+ * requested and then frees the bounce buffer.
+ */
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
+{
+	if (op_is_write(bio_op(bio)))
+		bio_free_folios(bio);
+	else
+		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
+}
+
 static void submit_bio_wait_endio(struct bio *bio)
 {
 	complete(bio->bi_private);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c75a9b3672aa..95cfc79b88b8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -403,6 +403,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 	return iov_iter_npages(iter, max_segs);
 }
 
+/**
+ * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio
+ * @iter:	iter to bounce from
+ * @op:		REQ_OP_* for the bio
+ *
+ * Calculates how many bvecs are needed for the next bio to bounce from/to
+ * @iter.
+ */
+static inline unsigned short
+bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
+{
+	/*
+	 * We still need to bounce bvec iters, so don't special case them
+	 * here unlike in bio_iov_vecs_to_alloc.
+	 *
+	 * For reads we need to use a vector for the bounce buffer, account
+	 * for that here.
+	 */
+	if (op_is_write(op))
+		return iov_iter_npages(iter, BIO_MAX_VECS);
+	return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1;
+}
+
 struct request_queue;
 
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
@@ -456,6 +479,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
+
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			       struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 06/14] iomap: fix submission side handling of completion side errors
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (4 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-19 17:40     ` Darrick J. Wong
  2026-01-23  8:54     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 07/14] iomap: simplify iomap_dio_bio_iter Christoph Hellwig
                     ` (9 subsequent siblings)
  15 siblings, 2 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

The "if (dio->error)" in iomap_dio_bio_iter exists to stop submitting
more bios when a completion already returned an error.  Commit cfe057f7db1f
("iomap_dio_actor(): fix iov_iter bugs") made it revert the iov by
"copied", which is very wrong given that we've already consumed that
range and submitted a bio for it.

Fixes: cfe057f7db1f ("iomap_dio_actor(): fix iov_iter bugs")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 4000c8596d9b..867c0ac6df8f 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -443,9 +443,13 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
 		size_t n;
-		if (dio->error) {
-			iov_iter_revert(dio->submit.iter, copied);
-			copied = ret = 0;
+
+		/*
+		 * If completions already occurred and reported errors, give up now and
+		 * don't bother submitting more bios.
+		 */
+		if (unlikely(data_race(dio->error))) {
+			ret = 0;
 			goto out;
 		}
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 07/14] iomap: simplify iomap_dio_bio_iter
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (5 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 06/14] iomap: fix submission side handling of completion side errors Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-19 17:43     ` Darrick J. Wong
  2026-01-23  8:55     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter Christoph Hellwig
                     ` (8 subsequent siblings)
  15 siblings, 2 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Use iov_iter_count to check if we need to continue as that just reads
a field in the iov_iter, and only use bio_iov_vecs_to_alloc to calculate
the actual number of vectors to allocate for the bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 867c0ac6df8f..de03bc7cf4ed 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -312,7 +312,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
 	struct bio *bio;
 	bool need_zeroout = false;
-	int nr_pages, ret = 0;
+	int ret = 0;
 	u64 copied = 0;
 	size_t orig_count;
 	unsigned int alignment;
@@ -440,7 +440,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 			goto out;
 	}
 
-	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
 	do {
 		size_t n;
 
@@ -453,7 +452,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 			goto out;
 		}
 
-		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
+		bio = iomap_dio_alloc_bio(iter, dio,
+				bio_iov_vecs_to_alloc(dio->submit.iter,
+						BIO_MAX_VECS), bio_opf);
 		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 					  GFP_KERNEL);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@@ -495,16 +496,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 		dio->size += n;
 		copied += n;
 
-		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
-						 BIO_MAX_VECS);
 		/*
 		 * We can only poll for single bio I/Os.
 		 */
-		if (nr_pages)
+		if (iov_iter_count(dio->submit.iter))
 			dio->iocb->ki_flags &= ~IOCB_HIPRI;
 		iomap_dio_submit_bio(iter, dio, bio, pos);
 		pos += n;
-	} while (nr_pages);
+	} while (iov_iter_count(dio->submit.iter));
 
 	/*
 	 * We need to zeroout the tail of a sub-block write if the extent type
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (6 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 07/14] iomap: simplify iomap_dio_bio_iter Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-23  8:57     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
                     ` (7 subsequent siblings)
  15 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Factor out a separate helper that builds and submits a single bio.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/direct-io.c | 111 +++++++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 52 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index de03bc7cf4ed..bb79519dec65 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -302,6 +302,56 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
 	return 0;
 }
 
+static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
+		struct iomap_dio *dio, loff_t pos, unsigned int alignment,
+		blk_opf_t op)
+{
+	struct bio *bio;
+	ssize_t ret;
+
+	bio = iomap_dio_alloc_bio(iter, dio,
+			bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS),
+			op);
+	fscrypt_set_bio_crypt_ctx(bio, iter->inode,
+			pos >> iter->inode->i_blkbits, GFP_KERNEL);
+	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
+	bio->bi_write_hint = iter->inode->i_write_hint;
+	bio->bi_ioprio = dio->iocb->ki_ioprio;
+	bio->bi_private = dio;
+	bio->bi_end_io = iomap_dio_bio_end_io;
+
+	ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1);
+	if (unlikely(ret))
+		goto out_put_bio;
+	ret = bio->bi_iter.bi_size;
+
+	/*
+	 * An atomic write bio must cover the complete length.  If it doesn't,
+	 * error out.
+	 */
+	if ((op & REQ_ATOMIC) && WARN_ON_ONCE(ret != iomap_length(iter))) {
+		ret = -EINVAL;
+		goto out_put_bio;
+	}
+
+	if (dio->flags & IOMAP_DIO_WRITE)
+		task_io_account_write(ret);
+	else if (dio->flags & IOMAP_DIO_DIRTY)
+		bio_set_pages_dirty(bio);
+
+	/*
+	 * We can only poll for single bio I/Os.
+	 */
+	if (iov_iter_count(dio->submit.iter))
+		dio->iocb->ki_flags &= ~IOCB_HIPRI;
+	iomap_dio_submit_bio(iter, dio, bio, pos);
+	return ret;
+
+out_put_bio:
+	bio_put(bio);
+	return ret;
+}
+
 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	const struct iomap *iomap = &iter->iomap;
@@ -310,12 +360,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	const loff_t length = iomap_length(iter);
 	loff_t pos = iter->pos;
 	blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
-	struct bio *bio;
 	bool need_zeroout = false;
-	int ret = 0;
 	u64 copied = 0;
 	size_t orig_count;
 	unsigned int alignment;
+	ssize_t ret = 0;
 
 	/*
 	 * File systems that write out of place and always allocate new blocks
@@ -441,68 +490,27 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	}
 
 	do {
-		size_t n;
-
 		/*
 		 * If completions already occurred and reported errors, give up now and
 		 * don't bother submitting more bios.
 		 */
-		if (unlikely(data_race(dio->error))) {
-			ret = 0;
+		if (unlikely(data_race(dio->error)))
 			goto out;
-		}
 
-		bio = iomap_dio_alloc_bio(iter, dio,
-				bio_iov_vecs_to_alloc(dio->submit.iter,
-						BIO_MAX_VECS), bio_opf);
-		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
-					  GFP_KERNEL);
-		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
-		bio->bi_write_hint = inode->i_write_hint;
-		bio->bi_ioprio = dio->iocb->ki_ioprio;
-		bio->bi_private = dio;
-		bio->bi_end_io = iomap_dio_bio_end_io;
-
-		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
-					     alignment - 1);
-		if (unlikely(ret)) {
+		ret = iomap_dio_bio_iter_one(iter, dio, pos, alignment, bio_opf);
+		if (unlikely(ret < 0)) {
 			/*
 			 * We have to stop part way through an IO. We must fall
 			 * through to the sub-block tail zeroing here, otherwise
 			 * this short IO may expose stale data in the tail of
 			 * the block we haven't written data to.
 			 */
-			bio_put(bio);
-			goto zero_tail;
-		}
-
-		n = bio->bi_iter.bi_size;
-		if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
-			/*
-			 * An atomic write bio must cover the complete length,
-			 * which it doesn't, so error. We may need to zero out
-			 * the tail (complete FS block), similar to when
-			 * bio_iov_iter_get_pages() returns an error, above.
-			 */
-			ret = -EINVAL;
-			bio_put(bio);
-			goto zero_tail;
+			break;
 		}
-		if (dio->flags & IOMAP_DIO_WRITE)
-			task_io_account_write(n);
-		else if (dio->flags & IOMAP_DIO_DIRTY)
-			bio_set_pages_dirty(bio);
-
-		dio->size += n;
-		copied += n;
-
-		/*
-		 * We can only poll for single bio I/Os.
-		 */
-		if (iov_iter_count(dio->submit.iter))
-			dio->iocb->ki_flags &= ~IOCB_HIPRI;
-		iomap_dio_submit_bio(iter, dio, bio, pos);
-		pos += n;
+		dio->size += ret;
+		copied += ret;
+		pos += ret;
+		ret = 0;
 	} while (iov_iter_count(dio->submit.iter));
 
 	/*
@@ -511,7 +519,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	 * the block tail in the latter case, we can expose stale data via mmap
 	 * reads of the EOF block.
 	 */
-zero_tail:
 	if (need_zeroout ||
 	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
 		/* zero out from the end of the write to the end of the block */
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (7 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-23  8:58     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 10/14] iomap: free the bio before completing the dio Christoph Hellwig
                     ` (6 subsequent siblings)
  15 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Refactor the two per-bio completion handlers to share common code using
a new helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/direct-io.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bb79519dec65..c1d5db85c8c7 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -211,16 +211,20 @@ static void iomap_dio_done(struct iomap_dio *dio)
 	iomap_dio_complete_work(&dio->aio.work);
 }
 
-void iomap_dio_bio_end_io(struct bio *bio)
+static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
 	struct iomap_dio *dio = bio->bi_private;
 	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 
-	if (bio->bi_status)
-		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
-
-	if (atomic_dec_and_test(&dio->ref))
+	if (atomic_dec_and_test(&dio->ref)) {
+		/*
+		 * Avoid another context switch for the completion when already
+		 * called from the ioend completion workqueue.
+		 */
+		if (inline_completion)
+			dio->flags &= ~IOMAP_DIO_COMP_WORK;
 		iomap_dio_done(dio);
+	}
 
 	if (should_dirty) {
 		bio_check_pages_dirty(bio);
@@ -229,33 +233,25 @@ void iomap_dio_bio_end_io(struct bio *bio)
 		bio_put(bio);
 	}
 }
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+	struct iomap_dio *dio = bio->bi_private;
+
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+	__iomap_dio_bio_end_io(bio, false);
+}
 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
 
 u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
 {
 	struct iomap_dio *dio = ioend->io_bio.bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 	u32 vec_count = ioend->io_bio.bi_vcnt;
 
 	if (ioend->io_error)
 		iomap_dio_set_error(dio, ioend->io_error);
-
-	if (atomic_dec_and_test(&dio->ref)) {
-		/*
-		 * Try to avoid another context switch for the completion given
-		 * that we are already called from the ioend completion
-		 * workqueue.
-		 */
-		dio->flags &= ~IOMAP_DIO_COMP_WORK;
-		iomap_dio_done(dio);
-	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(&ioend->io_bio);
-	} else {
-		bio_release_pages(&ioend->io_bio, false);
-		bio_put(&ioend->io_bio);
-	}
+	__iomap_dio_bio_end_io(&ioend->io_bio, true);
 
 	/*
 	 * Return the number of bvecs completed as even direct I/O completions
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 10/14] iomap: free the bio before completing the dio
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (8 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-19 17:43     ` Darrick J. Wong
  2026-01-23  8:59     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED Christoph Hellwig
                     ` (5 subsequent siblings)
  15 siblings, 2 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

There are good arguments for processing the user completions ASAP vs.
freeing resources ASAP, but freeing the bio first here removes potential
use after free hazards when checking flags, and will simplify the
upcoming bounce buffer support.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/direct-io.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index c1d5db85c8c7..d4d52775ce25 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -214,7 +214,15 @@ static void iomap_dio_done(struct iomap_dio *dio)
 static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
 	struct iomap_dio *dio = bio->bi_private;
-	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+	if (dio->flags & IOMAP_DIO_DIRTY) {
+		bio_check_pages_dirty(bio);
+	} else {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+	}
+
+	/* Do not touch bio below, we just gave up our reference. */
 
 	if (atomic_dec_and_test(&dio->ref)) {
 		/*
@@ -225,13 +233,6 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 			dio->flags &= ~IOMAP_DIO_COMP_WORK;
 		iomap_dio_done(dio);
 	}
-
-	if (should_dirty) {
-		bio_check_pages_dirty(bio);
-	} else {
-		bio_release_pages(bio, false);
-		bio_put(bio);
-	}
 }
 
 void iomap_dio_bio_end_io(struct bio *bio)
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (9 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 10/14] iomap: free the bio before completing the dio Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-23  9:00     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 12/14] iomap: support ioends for direct reads Christoph Hellwig
                     ` (4 subsequent siblings)
  15 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Match the more descriptive iov_iter terminology instead of encoding
what we do with them for reads only.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/direct-io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index d4d52775ce25..eca7adda595a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -22,7 +22,7 @@
 #define IOMAP_DIO_WRITE_THROUGH	(1U << 28)
 #define IOMAP_DIO_NEED_SYNC	(1U << 29)
 #define IOMAP_DIO_WRITE		(1U << 30)
-#define IOMAP_DIO_DIRTY		(1U << 31)
+#define IOMAP_DIO_USER_BACKED	(1U << 31)
 
 struct iomap_dio {
 	struct kiocb		*iocb;
@@ -215,7 +215,7 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
 	struct iomap_dio *dio = bio->bi_private;
 
-	if (dio->flags & IOMAP_DIO_DIRTY) {
+	if (dio->flags & IOMAP_DIO_USER_BACKED) {
 		bio_check_pages_dirty(bio);
 	} else {
 		bio_release_pages(bio, false);
@@ -333,7 +333,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 
 	if (dio->flags & IOMAP_DIO_WRITE)
 		task_io_account_write(ret);
-	else if (dio->flags & IOMAP_DIO_DIRTY)
+	else if (dio->flags & IOMAP_DIO_USER_BACKED)
 		bio_set_pages_dirty(bio);
 
 	/*
@@ -679,7 +679,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			goto out_free_dio;
 
 		if (user_backed_iter(iter))
-			dio->flags |= IOMAP_DIO_DIRTY;
+			dio->flags |= IOMAP_DIO_USER_BACKED;
 
 		ret = kiocb_write_and_wait(iocb, iomi.len);
 		if (ret)
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 12/14] iomap: support ioends for direct reads
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (10 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-23  9:02     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O Christoph Hellwig
                     ` (3 subsequent siblings)
  15 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Support using the ioend structure to defer I/O completion for direct
reads in addition to writes.  This requires a check for the operation
to not merge reads and writes in iomap_ioend_can_merge.  This support
will be used for bounce buffered direct I/O reads that need to copy
data back to the user address space on read completion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/ioend.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 86f44922ed3b..800d12f45438 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -299,6 +299,14 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
 static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
 		struct iomap_ioend *next)
 {
+	/*
+	 * There is no point in merging reads as there is no completion
+	 * processing that can be easily batched up for them.
+	 */
+	if (bio_op(&ioend->io_bio) == REQ_OP_READ ||
+	    bio_op(&next->io_bio) == REQ_OP_READ)
+		return false;
+
 	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
 		return false;
 	if (next->io_flags & IOMAP_IOEND_BOUNDARY)
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (11 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 12/14] iomap: support ioends for direct reads Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-23  9:05     ` Damien Le Moal
  2026-01-19  7:44   ` [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages Christoph Hellwig
                     ` (2 subsequent siblings)
  15 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Add a new flag that requests bounce buffering for direct I/O.  This is
needed to satisfy the stable pages requirement of devices that need to
calculate checksums or parity over the data, and allows file systems to
properly work with things like T10 protection information.  The
implementation just calls out to the new bio bounce buffering helpers
to allocate a bounce buffer, which is used for the I/O, and to copy
data to/from it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/direct-io.c  | 30 ++++++++++++++++++++----------
 include/linux/iomap.h |  9 +++++++++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index eca7adda595a..9c572de0d596 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -215,7 +215,11 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
 {
 	struct iomap_dio *dio = bio->bi_private;
 
-	if (dio->flags & IOMAP_DIO_USER_BACKED) {
+	if (dio->flags & IOMAP_DIO_BOUNCE) {
+		bio_iov_iter_unbounce(bio, !!dio->error,
+				dio->flags & IOMAP_DIO_USER_BACKED);
+		bio_put(bio);
+	} else if (dio->flags & IOMAP_DIO_USER_BACKED) {
 		bio_check_pages_dirty(bio);
 	} else {
 		bio_release_pages(bio, false);
@@ -303,12 +307,16 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 		struct iomap_dio *dio, loff_t pos, unsigned int alignment,
 		blk_opf_t op)
 {
+	unsigned int nr_vecs;
 	struct bio *bio;
 	ssize_t ret;
 
-	bio = iomap_dio_alloc_bio(iter, dio,
-			bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS),
-			op);
+	if (dio->flags & IOMAP_DIO_BOUNCE)
+		nr_vecs = bio_iov_bounce_nr_vecs(dio->submit.iter, op);
+	else
+		nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
+
+	bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op);
 	fscrypt_set_bio_crypt_ctx(bio, iter->inode,
 			pos >> iter->inode->i_blkbits, GFP_KERNEL);
 	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@@ -317,7 +325,11 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
-	ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1);
+	if (dio->flags & IOMAP_DIO_BOUNCE)
+		ret = bio_iov_iter_bounce(bio, dio->submit.iter);
+	else
+		ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
+					     alignment - 1);
 	if (unlikely(ret))
 		goto out_put_bio;
 	ret = bio->bi_iter.bi_size;
@@ -333,7 +345,8 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 
 	if (dio->flags & IOMAP_DIO_WRITE)
 		task_io_account_write(ret);
-	else if (dio->flags & IOMAP_DIO_USER_BACKED)
+	else if ((dio->flags & IOMAP_DIO_USER_BACKED) &&
+		 !(dio->flags & IOMAP_DIO_BOUNCE))
 		bio_set_pages_dirty(bio);
 
 	/*
@@ -662,7 +675,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	dio->i_size = i_size_read(inode);
 	dio->dops = dops;
 	dio->error = 0;
-	dio->flags = 0;
+	dio->flags = dio_flags & (IOMAP_DIO_FSBLOCK_ALIGNED | IOMAP_DIO_BOUNCE);
 	dio->done_before = done_before;
 
 	dio->submit.iter = iter;
@@ -671,9 +684,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iocb->ki_flags & IOCB_NOWAIT)
 		iomi.flags |= IOMAP_NOWAIT;
 
-	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
-		dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
-
 	if (iov_iter_rw(iter) == READ) {
 		if (iomi.pos >= dio->i_size)
 			goto out_free_dio;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 520e967cb501..cf152f638665 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -562,6 +562,15 @@ struct iomap_dio_ops {
  */
 #define IOMAP_DIO_FSBLOCK_ALIGNED	(1 << 3)
 
+/*
+ * Bounce buffer instead of using zero copy access.
+ *
+ * This is needed if the device needs stable data to checksum or generate
+ * parity.  The file system must hook into the I/O submission and offload
+ * completions to user context for reads when this is set.
+ */
+#define IOMAP_DIO_BOUNCE		(1 << 4)
+
 ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (12 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O Christoph Hellwig
@ 2026-01-19  7:44   ` Christoph Hellwig
  2026-01-19 17:45     ` Darrick J. Wong
  2026-01-23  9:08     ` Damien Le Moal
  2026-01-23 12:10   ` bounce buffer direct I/O when stable pages are required v2 Anuj Gupta
  2026-01-23 12:24   ` Christian Brauner
  15 siblings, 2 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-19  7:44 UTC (permalink / raw)
  To: Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

Fix direct I/O on devices that require stable pages by asking iomap
to bounce buffer.  To support this, ioends are used for direct reads
in this case to provide a user context for copying data back from the
bounce buffer.

This fixes qemu when used on devices using T10 protection information
and probably other cases like iSCSI using data digests.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_aops.c |  8 ++++++--
 fs/xfs/xfs_file.c | 41 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 56a544638491..c3c1e149fff4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -103,7 +103,7 @@ xfs_ioend_put_open_zones(
  * IO write completion.
  */
 STATIC void
-xfs_end_ioend(
+xfs_end_ioend_write(
 	struct iomap_ioend	*ioend)
 {
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
@@ -202,7 +202,11 @@ xfs_end_io(
 			io_list))) {
 		list_del_init(&ioend->io_list);
 		iomap_ioend_try_merge(ioend, &tmp);
-		xfs_end_ioend(ioend);
+		if (bio_op(&ioend->io_bio) == REQ_OP_READ)
+			iomap_finish_ioends(ioend,
+				blk_status_to_errno(ioend->io_bio.bi_status));
+		else
+			xfs_end_ioend_write(ioend);
 		cond_resched();
 	}
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7874cf745af3..f6cc63dcf961 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -224,12 +224,34 @@ xfs_ilock_iocb_for_write(
 	return 0;
 }
 
+/*
+ * Bounce buffering dio reads need a user context to copy back the data.
+ * Use an ioend to provide that.
+ */
+static void
+xfs_dio_read_bounce_submit_io(
+	const struct iomap_iter	*iter,
+	struct bio		*bio,
+	loff_t			file_offset)
+{
+	iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
+	bio->bi_end_io = xfs_end_bio;
+	submit_bio(bio);
+}
+
+static const struct iomap_dio_ops xfs_dio_read_bounce_ops = {
+	.submit_io	= xfs_dio_read_bounce_submit_io,
+	.bio_set	= &iomap_ioend_bioset,
+};
+
 STATIC ssize_t
 xfs_file_dio_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
 {
 	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
+	unsigned int		dio_flags = 0;
+	const struct iomap_dio_ops *dio_ops = NULL;
 	ssize_t			ret;
 
 	trace_xfs_file_direct_read(iocb, to);
@@ -242,7 +264,12 @@ xfs_file_dio_read(
 	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
 	if (ret)
 		return ret;
-	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
+	if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
+		dio_ops = &xfs_dio_read_bounce_ops;
+		dio_flags |= IOMAP_DIO_BOUNCE;
+	}
+	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags,
+			NULL, 0);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
@@ -703,6 +730,8 @@ xfs_file_dio_write_aligned(
 		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
+	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+		dio_flags |= IOMAP_DIO_BOUNCE;
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
 out_unlock:
@@ -750,6 +779,7 @@ xfs_file_dio_write_atomic(
 {
 	unsigned int		iolock = XFS_IOLOCK_SHARED;
 	ssize_t			ret, ocount = iov_iter_count(from);
+	unsigned int		dio_flags = 0;
 	const struct iomap_ops	*dops;
 
 	/*
@@ -777,8 +807,10 @@ xfs_file_dio_write_atomic(
 	}
 
 	trace_xfs_file_direct_write(iocb, from);
-	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
-			0, NULL, 0);
+	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+		dio_flags |= IOMAP_DIO_BOUNCE;
+	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags,
+			NULL, 0);
 
 	/*
 	 * The retry mechanism is based on the ->iomap_begin method returning
@@ -867,6 +899,9 @@ xfs_file_dio_write_unaligned(
 	if (flags & IOMAP_DIO_FORCE_WAIT)
 		inode_dio_wait(VFS_I(ip));
 
+	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
+		flags |= IOMAP_DIO_BOUNCE;
+
 	trace_xfs_file_direct_write(iocb, from);
 	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
 			   &xfs_dio_write_ops, flags, NULL, 0);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 75+ messages in thread

* Re: [PATCH 06/14] iomap: fix submission side handling of completion side errors
  2026-01-19  7:44   ` [PATCH 06/14] iomap: fix submission side handling of completion side errors Christoph Hellwig
@ 2026-01-19 17:40     ` Darrick J. Wong
  2026-01-23  8:54     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-19 17:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:13AM +0100, Christoph Hellwig wrote:
> The "if (dio->error)" in iomap_dio_bio_iter exists to stop submitting
> more bios when a completion already returned an error.  Commit cfe057f7db1f
> ("iomap_dio_actor(): fix iov_iter bugs") made it revert the iov by
> "copied", which is very wrong given that we've already consumed that
> range and submitted a bio for it.
> 
> Fixes: cfe057f7db1f ("iomap_dio_actor(): fix iov_iter bugs")
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Thanks for answering my question last time around,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/iomap/direct-io.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 4000c8596d9b..867c0ac6df8f 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -443,9 +443,13 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
>  	do {
>  		size_t n;
> -		if (dio->error) {
> -			iov_iter_revert(dio->submit.iter, copied);
> -			copied = ret = 0;
> +
> +		/*
> +		 * If completions already occurred and reported errors, give up now and
> +		 * don't bother submitting more bios.
> +		 */
> +		if (unlikely(data_race(dio->error))) {
> +			ret = 0;
>  			goto out;
>  		}
>  
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 07/14] iomap: simplify iomap_dio_bio_iter
  2026-01-19  7:44   ` [PATCH 07/14] iomap: simplify iomap_dio_bio_iter Christoph Hellwig
@ 2026-01-19 17:43     ` Darrick J. Wong
  2026-01-23  8:55     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-19 17:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:14AM +0100, Christoph Hellwig wrote:
> Use iov_iter_count to check if we need to continue as that just reads
> a field in the iov_iter, and only use bio_iov_vecs_to_alloc to calculate
> the actual number of vectors to allocate for the bio.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

/me is satisfied that bio_iov_vecs_to_alloc -> iov_iter_count is a
reasonable enough substitution.

Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/iomap/direct-io.c | 13 ++++++-------
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 867c0ac6df8f..de03bc7cf4ed 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -312,7 +312,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  	blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
>  	struct bio *bio;
>  	bool need_zeroout = false;
> -	int nr_pages, ret = 0;
> +	int ret = 0;
>  	u64 copied = 0;
>  	size_t orig_count;
>  	unsigned int alignment;
> @@ -440,7 +440,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  			goto out;
>  	}
>  
> -	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
>  	do {
>  		size_t n;
>  
> @@ -453,7 +452,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  			goto out;
>  		}
>  
> -		bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
> +		bio = iomap_dio_alloc_bio(iter, dio,
> +				bio_iov_vecs_to_alloc(dio->submit.iter,
> +						BIO_MAX_VECS), bio_opf);
>  		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
>  					  GFP_KERNEL);
>  		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
> @@ -495,16 +496,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  		dio->size += n;
>  		copied += n;
>  
> -		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
> -						 BIO_MAX_VECS);
>  		/*
>  		 * We can only poll for single bio I/Os.
>  		 */
> -		if (nr_pages)
> +		if (iov_iter_count(dio->submit.iter))
>  			dio->iocb->ki_flags &= ~IOCB_HIPRI;
>  		iomap_dio_submit_bio(iter, dio, bio, pos);
>  		pos += n;
> -	} while (nr_pages);
> +	} while (iov_iter_count(dio->submit.iter));
>  
>  	/*
>  	 * We need to zeroout the tail of a sub-block write if the extent type
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 10/14] iomap: free the bio before completing the dio
  2026-01-19  7:44   ` [PATCH 10/14] iomap: free the bio before completing the dio Christoph Hellwig
@ 2026-01-19 17:43     ` Darrick J. Wong
  2026-01-23  8:59     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-19 17:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:17AM +0100, Christoph Hellwig wrote:
> There are good arguments for processing the user completions ASAP vs.
> freeing resources ASAP, but freeing the bio first here removes potential
> use after free hazards when checking flags, and will simplify the
> upcoming bounce buffer support.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/direct-io.c | 17 +++++++++--------
>  1 file changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index c1d5db85c8c7..d4d52775ce25 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -214,7 +214,15 @@ static void iomap_dio_done(struct iomap_dio *dio)
>  static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
>  {
>  	struct iomap_dio *dio = bio->bi_private;
> -	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> +
> +	if (dio->flags & IOMAP_DIO_DIRTY) {
> +		bio_check_pages_dirty(bio);
> +	} else {
> +		bio_release_pages(bio, false);
> +		bio_put(bio);
> +	}
> +
> +	/* Do not touch bio below, we just gave up our reference. */

Thanks for adding this!
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

>  
>  	if (atomic_dec_and_test(&dio->ref)) {
>  		/*
> @@ -225,13 +233,6 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
>  			dio->flags &= ~IOMAP_DIO_COMP_WORK;
>  		iomap_dio_done(dio);
>  	}
> -
> -	if (should_dirty) {
> -		bio_check_pages_dirty(bio);
> -	} else {
> -		bio_release_pages(bio, false);
> -		bio_put(bio);
> -	}
>  }
>  
>  void iomap_dio_bio_end_io(struct bio *bio)
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages
  2026-01-19  7:44   ` [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages Christoph Hellwig
@ 2026-01-19 17:45     ` Darrick J. Wong
  2026-01-23  9:08     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-19 17:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:21AM +0100, Christoph Hellwig wrote:
> Fix direct I/O on devices that require stable pages by asking iomap
> to bounce buffer.  To support this, ioends are used for direct reads
> in this case to provide a user context for copying data back from the
> bounce buffer.
> 
> This fixes qemu when used on devices using T10 protection information
> and probably other cases like iSCSI using data digests.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Ahaha, I forgot in the last round that s_dio_done_wq is not at all the
place for doing bio completions.
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  fs/xfs/xfs_aops.c |  8 ++++++--
>  fs/xfs/xfs_file.c | 41 ++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 44 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 56a544638491..c3c1e149fff4 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -103,7 +103,7 @@ xfs_ioend_put_open_zones(
>   * IO write completion.
>   */
>  STATIC void
> -xfs_end_ioend(
> +xfs_end_ioend_write(
>  	struct iomap_ioend	*ioend)
>  {
>  	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
> @@ -202,7 +202,11 @@ xfs_end_io(
>  			io_list))) {
>  		list_del_init(&ioend->io_list);
>  		iomap_ioend_try_merge(ioend, &tmp);
> -		xfs_end_ioend(ioend);
> +		if (bio_op(&ioend->io_bio) == REQ_OP_READ)
> +			iomap_finish_ioends(ioend,
> +				blk_status_to_errno(ioend->io_bio.bi_status));
> +		else
> +			xfs_end_ioend_write(ioend);
>  		cond_resched();
>  	}
>  }
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 7874cf745af3..f6cc63dcf961 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -224,12 +224,34 @@ xfs_ilock_iocb_for_write(
>  	return 0;
>  }
>  
> +/*
> + * Bounce buffering dio reads need a user context to copy back the data.
> + * Use an ioend to provide that.
> + */
> +static void
> +xfs_dio_read_bounce_submit_io(
> +	const struct iomap_iter	*iter,
> +	struct bio		*bio,
> +	loff_t			file_offset)
> +{
> +	iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT);
> +	bio->bi_end_io = xfs_end_bio;
> +	submit_bio(bio);
> +}
> +
> +static const struct iomap_dio_ops xfs_dio_read_bounce_ops = {
> +	.submit_io	= xfs_dio_read_bounce_submit_io,
> +	.bio_set	= &iomap_ioend_bioset,
> +};
> +
>  STATIC ssize_t
>  xfs_file_dio_read(
>  	struct kiocb		*iocb,
>  	struct iov_iter		*to)
>  {
>  	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
> +	unsigned int		dio_flags = 0;
> +	const struct iomap_dio_ops *dio_ops = NULL;
>  	ssize_t			ret;
>  
>  	trace_xfs_file_direct_read(iocb, to);
> @@ -242,7 +264,12 @@ xfs_file_dio_read(
>  	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
>  	if (ret)
>  		return ret;
> -	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
> +	if (mapping_stable_writes(iocb->ki_filp->f_mapping)) {
> +		dio_ops = &xfs_dio_read_bounce_ops;
> +		dio_flags |= IOMAP_DIO_BOUNCE;
> +	}
> +	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags,
> +			NULL, 0);
>  	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>  
>  	return ret;
> @@ -703,6 +730,8 @@ xfs_file_dio_write_aligned(
>  		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
>  		iolock = XFS_IOLOCK_SHARED;
>  	}
> +	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
> +		dio_flags |= IOMAP_DIO_BOUNCE;
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
>  out_unlock:
> @@ -750,6 +779,7 @@ xfs_file_dio_write_atomic(
>  {
>  	unsigned int		iolock = XFS_IOLOCK_SHARED;
>  	ssize_t			ret, ocount = iov_iter_count(from);
> +	unsigned int		dio_flags = 0;
>  	const struct iomap_ops	*dops;
>  
>  	/*
> @@ -777,8 +807,10 @@ xfs_file_dio_write_atomic(
>  	}
>  
>  	trace_xfs_file_direct_write(iocb, from);
> -	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
> -			0, NULL, 0);
> +	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
> +		dio_flags |= IOMAP_DIO_BOUNCE;
> +	ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags,
> +			NULL, 0);
>  
>  	/*
>  	 * The retry mechanism is based on the ->iomap_begin method returning
> @@ -867,6 +899,9 @@ xfs_file_dio_write_unaligned(
>  	if (flags & IOMAP_DIO_FORCE_WAIT)
>  		inode_dio_wait(VFS_I(ip));
>  
> +	if (mapping_stable_writes(iocb->ki_filp->f_mapping))
> +		flags |= IOMAP_DIO_BOUNCE;
> +
>  	trace_xfs_file_direct_write(iocb, from);
>  	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
>  			   &xfs_dio_write_ops, flags, NULL, 0);
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
@ 2026-01-22 11:00     ` Johannes Thumshirn
  2026-01-22 17:54     ` Darrick J. Wong
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 75+ messages in thread
From: Johannes Thumshirn @ 2026-01-22 11:00 UTC (permalink / raw)
  To: hch, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, WenRuo Qu, Al Viro,
	linux-block@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
@ 2026-01-22 11:04     ` Johannes Thumshirn
  2026-01-22 17:59     ` Darrick J. Wong
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 75+ messages in thread
From: Johannes Thumshirn @ 2026-01-22 11:04 UTC (permalink / raw)
  To: hch, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, WenRuo Qu, Al Viro,
	linux-block@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

Looks good,

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 04/14] block: remove bio_release_page
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
@ 2026-01-22 11:14     ` Johannes Thumshirn
  2026-01-22 17:26     ` Darrick J. Wong
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 75+ messages in thread
From: Johannes Thumshirn @ 2026-01-22 11:14 UTC (permalink / raw)
  To: hch, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, WenRuo Qu, Al Viro,
	linux-block@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

Looks good,

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
@ 2026-01-22 13:05     ` Johannes Thumshirn
  2026-01-22 17:25     ` Darrick J. Wong
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 75+ messages in thread
From: Johannes Thumshirn @ 2026-01-22 13:05 UTC (permalink / raw)
  To: hch, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, WenRuo Qu, Al Viro,
	linux-block@vger.kernel.org, linux-xfs@vger.kernel.org,
	linux-fsdevel@vger.kernel.org

From what I can see this looks good,

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
  2026-01-22 13:05     ` Johannes Thumshirn
@ 2026-01-22 17:25     ` Darrick J. Wong
  2026-01-23  5:51       ` Christoph Hellwig
  2026-01-23  8:52     ` Damien Le Moal
  2026-01-23 12:20     ` Anuj Gupta
  3 siblings, 1 reply; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-22 17:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:12AM +0100, Christoph Hellwig wrote:
> Add helpers that bounce buffer data into a bio, so that direct I/O can be
> implemented for cases where direct user access is not possible because
> stable in-flight data is required.  These are intended to be used as
> easily as bio_iov_iter_get_pages for the zero-copy path.
> 
> The write side is trivial and just copies data into the bounce buffer.
> The read side is a lot more complex because it needs to perform the copy
> from the completion context, and without preserving the iov_iter through
> the call chain.  It steals a trick from the integrity data user interface
> and uses the first vector in the bio for the bounce buffer data that is
> fed to the block I/O stack, and uses the others to record the user
> buffer fragments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/bio.c         | 178 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/bio.h |  26 +++++++
>  2 files changed, 204 insertions(+)
> 
> diff --git a/block/bio.c b/block/bio.c
> index c51b4e2470e2..da795b1df52a 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1266,6 +1266,184 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
>  	return bio_iov_iter_align_down(bio, iter, len_align_mask);
>  }
>  
> +static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
> +{
> +	struct folio *folio;
> +
> +	while (*size > PAGE_SIZE) {
> +		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
> +		if (folio)
> +			return folio;
> +		*size = rounddown_pow_of_two(*size - 1);
> +	}
> +
> +	return folio_alloc(gfp, get_order(*size));
> +}

Hrm.  Should we combine this with the slightly different version that is
in xfs_healthmon?

/* Allocate as much memory as we can get for verification buffer. */
static struct folio *
xfs_verify_alloc_folio(
	const unsigned int	iosize)
{
	unsigned int		order = get_order(iosize);

	while (order > 0) {
		struct folio	*folio =
			folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);

		if (folio)
			return folio;
		order--;
	}

	return folio_alloc(GFP_KERNEL, 0);
}

> +static void bio_free_folios(struct bio *bio)
> +{
> +	struct bio_vec *bv;
> +	int i;
> +
> +	bio_for_each_bvec_all(bv, bio, i) {
> +		struct folio *folio = page_folio(bv->bv_page);
> +
> +		if (!is_zero_folio(folio))
> +			folio_put(page_folio(bv->bv_page));

Isn't folio_put's argument just @folio again?

> +	}
> +}
> +
> +static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
> +{
> +	size_t total_len = iov_iter_count(iter);
> +
> +	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
> +		return -EINVAL;
> +	if (WARN_ON_ONCE(bio->bi_iter.bi_size))
> +		return -EINVAL;
> +	if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
> +		return -EINVAL;
> +
> +	do {
> +		size_t this_len = min(total_len, SZ_1M);
> +		struct folio *folio;
> +
> +		if (this_len > PAGE_SIZE * 2)
> +			this_len = rounddown_pow_of_two(this_len);
> +
> +		if (bio->bi_iter.bi_size > UINT_MAX - this_len)

Now that I've seen UINT_MAX appear twice in terms of limiting bio size,
I wonder if that ought to be encoded as a constant somewhere?

#define BIO_ITER_MAX_SIZE	(UINT_MAX)

(apologies if I'm digging up some horrible old flamewar from the 1830s)

> +			break;
> +
> +		folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
> +		if (!folio)
> +			break;
> +		bio_add_folio_nofail(bio, folio, this_len, 0);
> +
> +		if (copy_from_iter(folio_address(folio), this_len, iter) !=
> +				this_len) {
> +			bio_free_folios(bio);
> +			return -EFAULT;
> +		}
> +
> +		total_len -= this_len;
> +	} while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
> +
> +	if (!bio->bi_iter.bi_size)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
> +static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
> +{
> +	size_t len = min(iov_iter_count(iter), SZ_1M);
> +	struct folio *folio;
> +
> +	folio = folio_alloc_greedy(GFP_KERNEL, &len);
> +	if (!folio)
> +		return -ENOMEM;
> +
> +	do {
> +		ssize_t ret;
> +
> +		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
> +				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
> +		if (ret <= 0) {
> +			if (!bio->bi_vcnt)
> +				return ret;
> +			break;
> +		}
> +		len -= ret;
> +		bio->bi_iter.bi_size += ret;
> +	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
> +
> +	/*
> +	 * Set the folio directly here.  The above loop has already calculated
> +	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
> +	 * is safe as bi_vcnt is only for user by the submitter and not looked

"...for use by the submitter..." ?

> +	 * at by the actual I/O path.
> +	 */
> +	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
> +	if (iov_iter_extract_will_pin(iter))
> +		bio_set_flag(bio, BIO_PAGE_PINNED);
> +	return 0;
> +}
> +
> +/**
> + * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
> + * @bio:	bio to send
> + * @iter:	iter to read from / write into
> + *
> + * Helper for direct I/O implementations that need to bounce buffer because
> + * we need to checksum the data or perform other operations that require
> + * consistency.  Allocates folios to back the bounce buffer, and for writes
> + * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
> + * called on completion.
> + */
> +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
> +{
> +	if (op_is_write(bio_op(bio)))
> +		return bio_iov_iter_bounce_write(bio, iter);
> +	return bio_iov_iter_bounce_read(bio, iter);
> +}
> +
> +static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
> +{
> +	struct folio *folio = page_folio(bv->bv_page);
> +	size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
> +			bv->bv_offset / PAGE_SIZE + 1;
> +
> +	if (mark_dirty)
> +		folio_mark_dirty_lock(folio);
> +	unpin_user_folio(folio, nr_pages);
> +}
> +
> +static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
> +		bool mark_dirty)
> +{
> +	unsigned int len = bio->bi_io_vec[0].bv_len;
> +
> +	if (likely(!is_error)) {
> +		void *buf = bvec_virt(&bio->bi_io_vec[0]);
> +		struct iov_iter to;
> +
> +		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
> +				len);
> +		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);

I wonder, under what circumstances would the copy_to_iter come up short?

Something evil like $program initiates a directio read from a PI disk, a
BPF guy starts screaming in a datacenter to wobble the disk, and that
gives a compromised systemd enough time to attach to $program with
ptrace to unmap a page in the middle of the read buffer before
bio_iov_iter_unbounce_read gets called?

--D

> +	} else {
> +		/* No need to mark folios dirty if never copied to them */
> +		mark_dirty = false;
> +	}
> +
> +	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
> +		int i;
> +
> +		for (i = 0; i < bio->bi_vcnt; i++)
> +			bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
> +	}
> +
> +	folio_put(page_folio(bio->bi_io_vec[0].bv_page));
> +}
> +
> +/**
> + * bio_iov_iter_unbounce - finish a bounce buffer operation
> + * @bio:	completed bio
> + * @is_error:	%true if an I/O error occurred and data should not be copied
> + * @mark_dirty:	If %true, folios will be marked dirty.
> + *
> + * Helper for direct I/O implementations that need to bounce buffer because
> + * we need to checksum the data or perform other operations that require
> + * consistency.  Called to complete a bio set up by bio_iov_iter_bounce().
> + * Copies data back for reads, and marks the original folios dirty if
> + * requested and then frees the bounce buffer.
> + */
> +void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
> +{
> +	if (op_is_write(bio_op(bio)))
> +		bio_free_folios(bio);
> +	else
> +		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
> +}
> +
>  static void submit_bio_wait_endio(struct bio *bio)
>  {
>  	complete(bio->bi_private);
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c75a9b3672aa..95cfc79b88b8 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -403,6 +403,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
>  	return iov_iter_npages(iter, max_segs);
>  }
>  
> +/**
> + * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio
> + * @iter:	iter to bounce from
> + * @op:		REQ_OP_* for the bio
> + *
> + * Calculates how many bvecs are needed for the next bio to bounce from/to
> + * @iter.
> + */
> +static inline unsigned short
> +bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
> +{
> +	/*
> +	 * We still need to bounce bvec iters, so don't special case them
> +	 * here unlike in bio_iov_vecs_to_alloc.
> +	 *
> +	 * For reads we need to use a vector for the bounce buffer, account
> +	 * for that here.
> +	 */
> +	if (op_is_write(op))
> +		return iov_iter_npages(iter, BIO_MAX_VECS);
> +	return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1;
> +}
> +
>  struct request_queue;
>  
>  void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
> @@ -456,6 +479,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
>  extern void bio_set_pages_dirty(struct bio *bio);
>  extern void bio_check_pages_dirty(struct bio *bio);
>  
> +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
> +void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
> +
>  extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
>  			       struct bio *src, struct bvec_iter *src_iter);
>  extern void bio_copy_data(struct bio *dst, struct bio *src);
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 04/14] block: remove bio_release_page
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
  2026-01-22 11:14     ` Johannes Thumshirn
@ 2026-01-22 17:26     ` Darrick J. Wong
  2026-01-23  8:43     ` Damien Le Moal
  2026-01-23 12:17     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-22 17:26 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:11AM +0100, Christoph Hellwig wrote:
> Merge bio_release_page into the only remaining caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  block/bio.c |  4 +++-
>  block/blk.h | 11 -----------
>  2 files changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 12cd3c5f6d6d..c51b4e2470e2 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1195,7 +1195,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
>  			break;
>  		}
>  
> -		bio_release_page(bio, bv->bv_page);
> +		if (bio_flagged(bio, BIO_PAGE_PINNED))
> +			unpin_user_page(bv->bv_page);
> +
>  		bio->bi_vcnt--;
>  		nbytes -= bv->bv_len;
>  	} while (nbytes);
> diff --git a/block/blk.h b/block/blk.h
> index 980eef1f5690..886238cae5f1 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -595,17 +595,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
>  
>  struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
>  		struct lock_class_key *lkclass);
> -
> -/*
> - * Clean up a page appropriately, where the page may be pinned, may have a
> - * ref taken on it or neither.
> - */
> -static inline void bio_release_page(struct bio *bio, struct page *page)
> -{
> -	if (bio_flagged(bio, BIO_PAGE_PINNED))
> -		unpin_user_page(page);
> -}
> -
>  struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
>  
>  int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-19  7:44   ` [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Christoph Hellwig
@ 2026-01-22 17:47     ` Darrick J. Wong
  2026-01-23  5:44       ` Christoph Hellwig
  2026-01-23 11:37     ` David Howells
  1 sibling, 1 reply; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-22 17:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:10AM +0100, Christoph Hellwig wrote:
> Massage __bio_iov_iter_get_pages so that it doesn't need the bio, and
> move it to lib/iov_iter.c so that it can be used by block code for
> other things than filling a bio and by other subsystems like netfs.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/bio.c         | 120 +++++++-------------------------------------
>  include/linux/uio.h |   3 ++
>  lib/iov_iter.c      |  98 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 119 insertions(+), 102 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 46ff33f4de04..12cd3c5f6d6d 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1172,102 +1172,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
>  	bio_set_flag(bio, BIO_CLONED);
>  }
>  
> -static unsigned int get_contig_folio_len(struct page **pages,
> -					 unsigned int *num_pages, size_t left,
> -					 size_t offset)
> -{
> -	struct folio *folio = page_folio(pages[0]);
> -	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
> -	unsigned int max_pages, i;
> -	size_t folio_offset, len;
> -
> -	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
> -	len = min(folio_size(folio) - folio_offset, left);
> -
> -	/*
> -	 * We might COW a single page in the middle of a large folio, so we have
> -	 * to check that all pages belong to the same folio.
> -	 */
> -	left -= contig_sz;
> -	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> -	for (i = 1; i < max_pages; i++) {
> -		size_t next = min_t(size_t, PAGE_SIZE, left);
> -
> -		if (page_folio(pages[i]) != folio ||
> -		    pages[i] != pages[i - 1] + 1)
> -			break;
> -		contig_sz += next;
> -		left -= next;
> -	}
> -
> -	*num_pages = i;
> -	return contig_sz;
> -}
> -
> -#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
> -
> -/**
> - * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
> - * @bio: bio to add pages to
> - * @iter: iov iterator describing the region to be mapped
> - *
> - * Extracts pages from *iter and appends them to @bio's bvec array.  The pages
> - * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
> - * For a multi-segment *iter, this function only adds pages from the next
> - * non-empty segment of the iov iterator.
> - */
> -static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> -{
> -	iov_iter_extraction_t extraction_flags = 0;
> -	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> -	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
> -	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
> -	struct page **pages = (struct page **)bv;

Huh.  We type-abuse an array of bio_vec's as an array of struct page
pointers??

As a straight hoist the patch looks correct but I'm confused about this.

--D

> -	ssize_t size;
> -	unsigned int i = 0;
> -	size_t offset, left, len;
> -
> -	/*
> -	 * Move page array up in the allocated memory for the bio vecs as far as
> -	 * possible so that we can start filling biovecs from the beginning
> -	 * without overwriting the temporary page array.
> -	 */
> -	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
> -	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> -
> -	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
> -		extraction_flags |= ITER_ALLOW_P2PDMA;
> -
> -	size = iov_iter_extract_pages(iter, &pages,
> -				      UINT_MAX - bio->bi_iter.bi_size,
> -				      nr_pages, extraction_flags, &offset);
> -	if (unlikely(size <= 0))
> -		return size ? size : -EFAULT;
> -
> -	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> -	for (left = size; left > 0; left -= len) {
> -		unsigned int nr_to_add;
> -
> -		if (bio->bi_vcnt > 0) {
> -			struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
> -
> -			if (!zone_device_pages_have_same_pgmap(prev->bv_page,
> -					pages[i]))
> -				break;
> -		}
> -
> -		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> -		__bio_add_page(bio, pages[i], len, offset);
> -		i += nr_to_add;
> -		offset = 0;
> -	}
> -
> -	iov_iter_revert(iter, left);
> -	while (i < nr_pages)
> -		bio_release_page(bio, pages[i++]);
> -	return size - left;
> -}
> -
>  /*
>   * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that
>   * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length
> @@ -1325,7 +1229,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
>  int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
>  			   unsigned len_align_mask)
>  {
> -	ssize_t ret;
> +	iov_iter_extraction_t flags = 0;
>  
>  	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>  		return -EIO;
> @@ -1338,14 +1242,26 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
>  
>  	if (iov_iter_extract_will_pin(iter))
>  		bio_set_flag(bio, BIO_PAGE_PINNED);
> +	if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
> +		flags |= ITER_ALLOW_P2PDMA;
>  
>  	do {
> -		ret = __bio_iov_iter_get_pages(bio, iter);
> -	} while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0));
> +		ssize_t ret;
> +
> +		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
> +				UINT_MAX - bio->bi_iter.bi_size, &bio->bi_vcnt,
> +				bio->bi_max_vecs, flags);
> +		if (ret <= 0) {
> +			if (!bio->bi_vcnt)
> +				return ret;
> +			break;
> +		}
> +		bio->bi_iter.bi_size += ret;
> +	} while (iov_iter_count(iter) && !bio_full(bio, 0));
>  
> -	if (bio->bi_vcnt)
> -		return bio_iov_iter_align_down(bio, iter, len_align_mask);
> -	return ret;
> +	if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page))
> +		bio->bi_opf |= REQ_NOMERGE;
> +	return bio_iov_iter_align_down(bio, iter, len_align_mask);
>  }
>  
>  static void submit_bio_wait_endio(struct bio *bio)
> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 5b127043a151..a9bc5b3067e3 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -389,6 +389,9 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
>  			       size_t maxsize, unsigned int maxpages,
>  			       iov_iter_extraction_t extraction_flags,
>  			       size_t *offset0);
> +ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
> +		size_t max_size, unsigned short *nr_vecs,
> +		unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
>  
>  /**
>   * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 896760bad455..545250507f08 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
>  	return -EFAULT;
>  }
>  EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
> +
> +static unsigned int get_contig_folio_len(struct page **pages,
> +		unsigned int *num_pages, size_t left, size_t offset)
> +{
> +	struct folio *folio = page_folio(pages[0]);
> +	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
> +	unsigned int max_pages, i;
> +	size_t folio_offset, len;
> +
> +	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
> +	len = min(folio_size(folio) - folio_offset, left);
> +
> +	/*
> +	 * We might COW a single page in the middle of a large folio, so we have
> +	 * to check that all pages belong to the same folio.
> +	 */
> +	left -= contig_sz;
> +	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> +	for (i = 1; i < max_pages; i++) {
> +		size_t next = min_t(size_t, PAGE_SIZE, left);
> +
> +		if (page_folio(pages[i]) != folio ||
> +		    pages[i] != pages[i - 1] + 1)
> +			break;
> +		contig_sz += next;
> +		left -= next;
> +	}
> +
> +	*num_pages = i;
> +	return contig_sz;
> +}
> +
> +#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
> +
> +/**
> + * iov_iter_extract_bvecs - Extract bvecs from an iterator
> + * @iter:	the iterator to extract from
> + * @bv:		bvec return array
> + * @max_size:	maximum size to extract from @iter
> + * @nr_vecs:	number of vectors in @bv (on input and output)
> + * @max_vecs:	maximum vectors in @bv, including those filled before calling
> + * @extraction_flags: flags to qualify request
> + *
> + * Like iov_iter_extract_pages(), but returns physically contiguous ranges
> + * contained in a single folio as a single bvec instead of multiple entries.
> + *
> + * Returns the number of bytes extracted when successful, or a negative errno.
> + * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes
> + * can be 0.
> + */
> +ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
> +		size_t max_size, unsigned short *nr_vecs,
> +		unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
> +{
> +	unsigned short entries_left = max_vecs - *nr_vecs;
> +	unsigned short nr_pages, i = 0;
> +	size_t left, offset, len;
> +	struct page **pages;
> +	ssize_t size;
> +
> +	/*
> +	 * Move page array up in the allocated memory for the bio vecs as far as
> +	 * possible so that we can start filling biovecs from the beginning
> +	 * without overwriting the temporary page array.
> +	 */
> +	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
> +	pages = (struct page **)(bv + *nr_vecs) +
> +		entries_left * (PAGE_PTRS_PER_BVEC - 1);
> +
> +	size = iov_iter_extract_pages(iter, &pages, max_size, entries_left,
> +			extraction_flags, &offset);
> +	if (unlikely(size <= 0))
> +		return size ? size : -EFAULT;
> +
> +	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> +	for (left = size; left > 0; left -= len) {
> +		unsigned int nr_to_add;
> +
> +		if (*nr_vecs > 0 &&
> +		    !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page,
> +				pages[i]))
> +			break;
> +
> +		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> +		bvec_set_page(&bv[*nr_vecs], pages[i], len, offset);
> +		i += nr_to_add;
> +		(*nr_vecs)++;
> +		offset = 0;
> +	}
> +
> +	iov_iter_revert(iter, left);
> +	if (iov_iter_extract_will_pin(iter)) {
> +		while (i < nr_pages)
> +			unpin_user_page(pages[i++]);
> +	}
> +	return size - left;
> +}
> +EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs);
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
  2026-01-22 11:00     ` Johannes Thumshirn
@ 2026-01-22 17:54     ` Darrick J. Wong
  2026-01-23  8:32     ` Damien Le Moal
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-22 17:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:08AM +0100, Christoph Hellwig wrote:
> Move all of the logic to find the contiguous length inside a folio into
> get_contig_folio_len instead of keeping some of it in the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

I like that this change makes it easier for me to guess what
get_contig_folio_len does just by looking at the arguments.

Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  block/bio.c | 62 +++++++++++++++++++++++------------------------------
>  1 file changed, 27 insertions(+), 35 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 2359c0723b88..18dfdaba0c73 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1172,33 +1172,35 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
>  	bio_set_flag(bio, BIO_CLONED);
>  }
>  
> -static unsigned int get_contig_folio_len(unsigned int *num_pages,
> -					 struct page **pages, unsigned int i,
> -					 struct folio *folio, size_t left,
> +static unsigned int get_contig_folio_len(struct page **pages,
> +					 unsigned int *num_pages, size_t left,
>  					 size_t offset)
>  {
> -	size_t bytes = left;
> -	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
> -	unsigned int j;
> +	struct folio *folio = page_folio(pages[0]);
> +	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
> +	unsigned int max_pages, i;
> +	size_t folio_offset, len;
> +
> +	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
> +	len = min(folio_size(folio) - folio_offset, left);
>  
>  	/*
> -	 * We might COW a single page in the middle of
> -	 * a large folio, so we have to check that all
> -	 * pages belong to the same folio.
> +	 * We might COW a single page in the middle of a large folio, so we have
> +	 * to check that all pages belong to the same folio.
>  	 */
> -	bytes -= contig_sz;
> -	for (j = i + 1; j < i + *num_pages; j++) {
> -		size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +	left -= contig_sz;
> +	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> +	for (i = 1; i < max_pages; i++) {
> +		size_t next = min_t(size_t, PAGE_SIZE, left);
>  
> -		if (page_folio(pages[j]) != folio ||
> -		    pages[j] != pages[j - 1] + 1) {
> +		if (page_folio(pages[i]) != folio ||
> +		    pages[i] != pages[i - 1] + 1)
>  			break;
> -		}
>  		contig_sz += next;
> -		bytes -= next;
> +		left -= next;
>  	}
> -	*num_pages = j - i;
>  
> +	*num_pages = i;
>  	return contig_sz;
>  }
>  
> @@ -1222,8 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>  	struct page **pages = (struct page **)bv;
>  	ssize_t size;
> -	unsigned int num_pages, i = 0;
> -	size_t offset, folio_offset, left, len;
> +	unsigned int i = 0;
> +	size_t offset, left, len;
>  	int ret = 0;
>  
>  	/*
> @@ -1244,23 +1246,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  		return size ? size : -EFAULT;
>  
>  	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> -	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
> -		struct page *page = pages[i];
> -		struct folio *folio = page_folio(page);
> +	for (left = size; left > 0; left -= len) {
>  		unsigned int old_vcnt = bio->bi_vcnt;
> +		unsigned int nr_to_add;
>  
> -		folio_offset = ((size_t)folio_page_idx(folio, page) <<
> -			       PAGE_SHIFT) + offset;
> -
> -		len = min(folio_size(folio) - folio_offset, left);
> -
> -		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> -
> -		if (num_pages > 1)
> -			len = get_contig_folio_len(&num_pages, pages, i,
> -						   folio, left, offset);
> -
> -		if (!bio_add_folio(bio, folio, len, folio_offset)) {
> +		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> +		if (!bio_add_page(bio, pages[i], len, offset)) {
>  			WARN_ON_ONCE(1);
>  			ret = -EINVAL;
>  			goto out;
> @@ -1275,8 +1266,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  			 * single pin per page.
>  			 */
>  			if (offset && bio->bi_vcnt == old_vcnt)
> -				unpin_user_folio(folio, 1);
> +				unpin_user_folio(page_folio(pages[i]), 1);
>  		}
> +		i += nr_to_add;
>  		offset = 0;
>  	}
>  
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
  2026-01-22 11:04     ` Johannes Thumshirn
@ 2026-01-22 17:59     ` Darrick J. Wong
  2026-01-23  5:43       ` Christoph Hellwig
  2026-01-23  8:35     ` Damien Le Moal
  2026-01-23 12:15     ` Anuj Gupta
  3 siblings, 1 reply; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-22 17:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Mon, Jan 19, 2026 at 08:44:09AM +0100, Christoph Hellwig wrote:
> bio_add_page fails to add data to the bio when mixing P2P with non-P2P
> ranges, or ranges that map to different P2P providers.  In that case
> it will trigger that WARN_ON and return an error up the chain instead of
> simply starting a new bio as intended.  Fix this by open coding

AFAICT we've already done all the other checks in bio_add_page, so
calling __bio_add_page directly from within the loop is ok since you've
explicitly handled the !zone_device_pages_have_same_pgmap() case.

> bio_add_page and handling this case explicitly.  While doing so, stop
> merging physical contiguous data that belongs to multiple folios.  While
> this merge could lead to more efficient bio packing in some cases,
> dropping it will allow removing the handling of this corner case in other
> places and make the code more robust.

That does sound like a landmine waiting to go off...

> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good to me,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  block/bio.c | 37 +++++++++++++------------------------
>  1 file changed, 13 insertions(+), 24 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 18dfdaba0c73..46ff33f4de04 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1216,7 +1216,7 @@ static unsigned int get_contig_folio_len(struct page **pages,
>   * For a multi-segment *iter, this function only adds pages from the next
>   * non-empty segment of the iov iterator.
>   */
> -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> +static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  {
>  	iov_iter_extraction_t extraction_flags = 0;
>  	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> @@ -1226,7 +1226,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	ssize_t size;
>  	unsigned int i = 0;
>  	size_t offset, left, len;
> -	int ret = 0;
>  
>  	/*
>  	 * Move page array up in the allocated memory for the bio vecs as far as
> @@ -1247,37 +1246,26 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  
>  	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
>  	for (left = size; left > 0; left -= len) {
> -		unsigned int old_vcnt = bio->bi_vcnt;
>  		unsigned int nr_to_add;
>  
> -		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> -		if (!bio_add_page(bio, pages[i], len, offset)) {
> -			WARN_ON_ONCE(1);
> -			ret = -EINVAL;
> -			goto out;
> -		}
> +		if (bio->bi_vcnt > 0) {
> +			struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
>  
> -		if (bio_flagged(bio, BIO_PAGE_PINNED)) {
> -			/*
> -			 * We're adding another fragment of a page that already
> -			 * was part of the last segment.  Undo our pin as the
> -			 * page was pinned when an earlier fragment of it was
> -			 * added to the bio and __bio_release_pages expects a
> -			 * single pin per page.
> -			 */
> -			if (offset && bio->bi_vcnt == old_vcnt)
> -				unpin_user_folio(page_folio(pages[i]), 1);
> +			if (!zone_device_pages_have_same_pgmap(prev->bv_page,
> +					pages[i]))
> +				break;
>  		}
> +
> +		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> +		__bio_add_page(bio, pages[i], len, offset);
>  		i += nr_to_add;
>  		offset = 0;
>  	}
>  
>  	iov_iter_revert(iter, left);
> -out:
>  	while (i < nr_pages)
>  		bio_release_page(bio, pages[i++]);
> -
> -	return ret;
> +	return size - left;
>  }
>  
>  /*
> @@ -1337,7 +1325,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
>  int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
>  			   unsigned len_align_mask)
>  {
> -	int ret = 0;
> +	ssize_t ret;
>  
>  	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>  		return -EIO;
> @@ -1350,9 +1338,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
>  
>  	if (iov_iter_extract_will_pin(iter))
>  		bio_set_flag(bio, BIO_PAGE_PINNED);
> +
>  	do {
>  		ret = __bio_iov_iter_get_pages(bio, iter);
> -	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
> +	} while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0));
>  
>  	if (bio->bi_vcnt)
>  		return bio_iov_iter_align_down(bio, iter, len_align_mask);
> -- 
> 2.47.3
> 
> 

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-22 17:59     ` Darrick J. Wong
@ 2026-01-23  5:43       ` Christoph Hellwig
  2026-01-23  7:05         ` Darrick J. Wong
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  5:43 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Thu, Jan 22, 2026 at 09:59:08AM -0800, Darrick J. Wong wrote:
> On Mon, Jan 19, 2026 at 08:44:09AM +0100, Christoph Hellwig wrote:
> > bio_add_page fails to add data to the bio when mixing P2P with non-P2P
> > ranges, or ranges that map to different P2P providers.  In that case
> > it will trigger that WARN_ON and return an error up the chain instead of
> > simply starting a new bio as intended.  Fix this by open coding
> 
> AFAICT we've already done all the other checks in bio_add_page, so
> calling __bio_add_page directly from within the loop is ok since you've
> explicitly handled the !zone_device_pages_have_same_pgmap() case.
> 
> > bio_add_page and handling this case explicitly.  While doing so, stop
> > merging physical contiguous data that belongs to multiple folios.  While
> > this merge could lead to more efficient bio packing in some cases,
> > dropping it will allow removing the handling of this corner case in other
> > places and make the code more robust.
> 
> That does sound like a landmine waiting to go off...

What?  Removing the handling?


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-22 17:47     ` Darrick J. Wong
@ 2026-01-23  5:44       ` Christoph Hellwig
  2026-01-23  7:09         ` Darrick J. Wong
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  5:44 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Thu, Jan 22, 2026 at 09:47:03AM -0800, Darrick J. Wong wrote:
> > -	struct page **pages = (struct page **)bv;
> 
> Huh.  We type-abuse an array of bio_vec's as an array of struct page
> pointers??
> 
> As a straight hoist the patch looks correct but I'm confused about this.

Yes.  This uses the larger space allocated for bio_vecs to first
place the pages at the end, and then fill in the bio_vecs from the
beginning.  I think the comments describe it pretty well, but if you
have ideas for enhancement, this might be a good time to update them.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-22 17:25     ` Darrick J. Wong
@ 2026-01-23  5:51       ` Christoph Hellwig
  2026-01-23  7:11         ` Darrick J. Wong
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  5:51 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Thu, Jan 22, 2026 at 09:25:56AM -0800, Darrick J. Wong wrote:
> Hrm.  Should we combine this with the slightly different version that is
> in xfs_healthmon?

Yes, but not now.  I'd rather not introduce a three-way cross tree
dependency with bike shedding potential right now.  Let's look at this
once we have the two versions in tree, and also look out for others.

> > +static void bio_free_folios(struct bio *bio)
> > +{
> > +	struct bio_vec *bv;
> > +	int i;
> > +
> > +	bio_for_each_bvec_all(bv, bio, i) {
> > +		struct folio *folio = page_folio(bv->bv_page);
> > +
> > +		if (!is_zero_folio(folio))
> > +			folio_put(page_folio(bv->bv_page));
> 
> Isn't folio_put's argument just @folio again?

Yes, I'll clean this up.

> > +		if (this_len > PAGE_SIZE * 2)
> > +			this_len = rounddown_pow_of_two(this_len);
> > +
> > +		if (bio->bi_iter.bi_size > UINT_MAX - this_len)
> 
> Now that I've seen UINT_MAX appear twice in terms of limiting bio size,
> I wonder if that ought to be encoded as a constant somewhere?
> 
> #define BIO_ITER_MAX_SIZE	(UINT_MAX)
> 
> (apologies if I'm digging up some horrible old flamewar from the 1830s)

Heh.  I don't remember any flame wars, but maybe that's just because my
memory sucks.  I guess this would be more like:

#define BVEC_ITER_MAX_SIZE	sizeof_field(struct bvec_iter, bi_size)

though.
	
> > +	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
> > +
> > +	/*
> > +	 * Set the folio directly here.  The above loop has already calculated
> > +	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
> > +	 * is safe as bi_vcnt is only for user by the submitter and not looked
> 
> "...for use by the submitter..." ?

Yes.

> > +	if (likely(!is_error)) {
> > +		void *buf = bvec_virt(&bio->bi_io_vec[0]);
> > +		struct iov_iter to;
> > +
> > +		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
> > +				len);
> > +		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
> 
> I wonder, under what circumstances would the copy_to_iter come up short?
> 
> Something evil like $program initiates a directio read from a PI disk, a
> BPF guy starts screaming in a datacenter to wobble the disk, and that
> gives a compromised systemd enough time to attach to $program with
> ptrace to unmap a page in the middle of the read buffer before
> bio_iov_iter_unbounce_read gets called?

I don't think it can at all.   Remember, this is not directly copying
to the user iter, but to the bvec array pointing to pinned pages,
which are not going away. 


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-23  5:43       ` Christoph Hellwig
@ 2026-01-23  7:05         ` Darrick J. Wong
  0 siblings, 0 replies; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-23  7:05 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Fri, Jan 23, 2026 at 06:43:14AM +0100, Christoph Hellwig wrote:
> On Thu, Jan 22, 2026 at 09:59:08AM -0800, Darrick J. Wong wrote:
> > On Mon, Jan 19, 2026 at 08:44:09AM +0100, Christoph Hellwig wrote:
> > > bio_add_page fails to add data to the bio when mixing P2P with non-P2P
> > > ranges, or ranges that map to different P2P providers.  In that case
> > > it will trigger that WARN_ON and return an error up the chain instead of
> > > simply starting a new bio as intended.  Fix this by open coding
> > 
> > AFAICT we've already done all the other checks in bio_add_page, so
> > calling __bio_add_page directly from within the loop is ok since you've
> > explicitly handled the !zone_device_pages_have_same_pgmap() case.
> > 
> > > bio_add_page and handling this case explicitly.  While doing so, stop
> > > merging physical contiguous data that belongs to multiple folios.  While
> > > this merge could lead to more efficient bio packing in some cases,
> > > dropping it will allow removing the handling of this corner case in other
> > > places and make the code more robust.
> > 
> > That does sound like a landmine waiting to go off...
> 
> What?  Removing the handling?

Urk, sorry.  I meant to say that the *old code* combining pages from
multiple folios sounded like a landmine waiting to go off.

--D

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23  5:44       ` Christoph Hellwig
@ 2026-01-23  7:09         ` Darrick J. Wong
  2026-01-23  7:14           ` Christoph Hellwig
  0 siblings, 1 reply; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-23  7:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Fri, Jan 23, 2026 at 06:44:48AM +0100, Christoph Hellwig wrote:
> On Thu, Jan 22, 2026 at 09:47:03AM -0800, Darrick J. Wong wrote:
> > > -	struct page **pages = (struct page **)bv;
> > 
> > Huh.  We type-abuse an array of bio_vec's as an array of struct page
> > pointers??
> > 
> > As a straight hoist the patch looks correct but I'm confused about this.
> 
> Yes.  This uses the larger space allocated for bio_vecs to first
> place the pages at the end, and then fill in the bio_vecs from the
> beginning.  I think the comments describe it pretty well, but if you
> have ideas for enhancement, this might be a good time to update them.

I'm not sure, since the alternative is to wrap the whole mess in a
union, which makes the type-abuse more explicit but then is still pretty
ugly.  The only improvement I can really think of would be a huge
comment wherever we start this, which I think Kent's original code from
2011 had ("deep magic", etc).

--D

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-23  5:51       ` Christoph Hellwig
@ 2026-01-23  7:11         ` Darrick J. Wong
  2026-01-23  7:16           ` Christoph Hellwig
  0 siblings, 1 reply; 75+ messages in thread
From: Darrick J. Wong @ 2026-01-23  7:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Fri, Jan 23, 2026 at 06:51:28AM +0100, Christoph Hellwig wrote:
> On Thu, Jan 22, 2026 at 09:25:56AM -0800, Darrick J. Wong wrote:
> > Hrm.  Should we combine this with the slightly different version that is
> > in xfs_healthmon?
> 
> Yes, but not now.  I'd rather not introduce a three-way cross tree
> dependency with bike shedding potential right now.  Let's look at this
> once we have the two versions in tree, and also look out for others.

<nod>

> > > +static void bio_free_folios(struct bio *bio)
> > > +{
> > > +	struct bio_vec *bv;
> > > +	int i;
> > > +
> > > +	bio_for_each_bvec_all(bv, bio, i) {
> > > +		struct folio *folio = page_folio(bv->bv_page);
> > > +
> > > +		if (!is_zero_folio(folio))
> > > +			folio_put(page_folio(bv->bv_page));
> > 
> > Isn't folio_put's argument just @folio again?
> 
> Yes, I'll clean this up.
> 
> > > +		if (this_len > PAGE_SIZE * 2)
> > > +			this_len = rounddown_pow_of_two(this_len);
> > > +
> > > +		if (bio->bi_iter.bi_size > UINT_MAX - this_len)
> > 
> > Now that I've seen UINT_MAX appear twice in terms of limiting bio size,
> > I wonder if that ought to be encoded as a constant somewhere?
> > 
> > #define BIO_ITER_MAX_SIZE	(UINT_MAX)
> > 
> > (apologies if I'm digging up some horrible old flamewar from the 1830s)
> 
> Heh.  I don't remember any flame wars, but maybe that's just because my
> memory sucks.

Well it's not like I'm highly incentivized to remember misinteractions
on the mailing lists... :D

>  I guess this would be more like:
> 
> #define BVEC_ITER_MAX_SIZE	sizeof_field(struct bvec_iter, bi_size)
> 
> though.

Hrmm, that might be better.

> > > +	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
> > > +
> > > +	/*
> > > +	 * Set the folio directly here.  The above loop has already calculated
> > > +	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
> > > +	 * is safe as bi_vcnt is only for user by the submitter and not looked
> > 
> > "...for use by the submitter..." ?
> 
> Yes.
> 
> > > +	if (likely(!is_error)) {
> > > +		void *buf = bvec_virt(&bio->bi_io_vec[0]);
> > > +		struct iov_iter to;
> > > +
> > > +		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
> > > +				len);
> > > +		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
> > 
> > I wonder, under what circumstances would the copy_to_iter come up short?
> > 
> > Something evil like $program initiates a directio read from a PI disk, a
> > BPF guy starts screaming in a datacenter to wobble the disk, and that
> > gives a compromised systemd enough time to attach to $program with
> > ptrace to unmap a page in the middle of the read buffer before
> > bio_iov_iter_unbounce_read gets called?
> 
> I don't think it can at all.   Remember, this is not directly copying
> to the user iter, but to the bvec array pointing to pinned pages,
> which are not going away. 

Ah, right.  How about adding a comment so that future me doesn't trip on
this again?

	/* copying to pinned pages should always work */
	WARN_ON_ONCE(copy_to_iter(...));

--D

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23  7:09         ` Darrick J. Wong
@ 2026-01-23  7:14           ` Christoph Hellwig
  0 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  7:14 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Thu, Jan 22, 2026 at 11:09:33PM -0800, Darrick J. Wong wrote:
> On Fri, Jan 23, 2026 at 06:44:48AM +0100, Christoph Hellwig wrote:
> > On Thu, Jan 22, 2026 at 09:47:03AM -0800, Darrick J. Wong wrote:
> > > > -	struct page **pages = (struct page **)bv;
> > > 
> > > Huh.  We type-abuse an array of bio_vec's as an array of struct page
> > > pointers??
> > > 
> > > As a straight hoist the patch looks correct but I'm confused about this.
> > 
> > Yes.  This uses the larger space allocated for bio_vecs to first
> > > place the pages at the end, and then fill in the bio_vecs from the
> > beginning.  I think the comments describe it pretty well, but if you
> > have ideas for enhancement, this might be a good time to update them.
> 
> I'm not sure, since the alternative is to wrap the whole mess in a
> union, which makes the type-abuse more explicit but then is still pretty
> ugly.  The only improvement I can really think of would be a huge
> comment wherever we start this, which I think Kent's original code from
> 2011 had ("deep magic", etc).

I don't think a union would be good here.  It primarily operates on
the bio_vecs embedded in the bio, which certainly should not be a
union.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-23  7:11         ` Darrick J. Wong
@ 2026-01-23  7:16           ` Christoph Hellwig
  0 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  7:16 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Thu, Jan 22, 2026 at 11:11:39PM -0800, Darrick J. Wong wrote:
> Ah, right.  How about adding a comment so that future me doesn't trip on
> this again?
> 
> 	/* copying to pinned pages should always work */
> 	WARN_ON_ONCE(copy_to_iter(...));

Done.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
  2026-01-22 11:00     ` Johannes Thumshirn
  2026-01-22 17:54     ` Darrick J. Wong
@ 2026-01-23  8:32     ` Damien Le Moal
  2026-01-23  8:35       ` Christoph Hellwig
  2026-01-23  8:45     ` Damien Le Moal
  2026-01-23 12:14     ` Anuj Gupta
  4 siblings, 1 reply; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:32 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Move all of the logic to find the contiguous length inside a folio into
> get_contig_folio_len instead of keeping some of it in the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/bio.c | 62 +++++++++++++++++++++++------------------------------
>  1 file changed, 27 insertions(+), 35 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 2359c0723b88..18dfdaba0c73 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1172,33 +1172,35 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
>  	bio_set_flag(bio, BIO_CLONED);
>  }
>  
> -static unsigned int get_contig_folio_len(unsigned int *num_pages,
> -					 struct page **pages, unsigned int i,
> -					 struct folio *folio, size_t left,
> +static unsigned int get_contig_folio_len(struct page **pages,
> +					 unsigned int *num_pages, size_t left,
>  					 size_t offset)
>  {
> -	size_t bytes = left;
> -	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
> -	unsigned int j;
> +	struct folio *folio = page_folio(pages[0]);
> +	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
> +	unsigned int max_pages, i;
> +	size_t folio_offset, len;
> +
> +	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;

folio_page_idx(folio, pages[0]) is always going to be 0 here, no ?

> +	len = min(folio_size(folio) - folio_offset, left);
>  
>  	/*
> -	 * We might COW a single page in the middle of
> -	 * a large folio, so we have to check that all
> -	 * pages belong to the same folio.
> +	 * We might COW a single page in the middle of a large folio, so we have
> +	 * to check that all pages belong to the same folio.
>  	 */
> -	bytes -= contig_sz;
> -	for (j = i + 1; j < i + *num_pages; j++) {
> -		size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +	left -= contig_sz;
> +	max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> +	for (i = 1; i < max_pages; i++) {
> +		size_t next = min_t(size_t, PAGE_SIZE, left);
>  
> -		if (page_folio(pages[j]) != folio ||
> -		    pages[j] != pages[j - 1] + 1) {
> +		if (page_folio(pages[i]) != folio ||
> +		    pages[i] != pages[i - 1] + 1)
>  			break;
> -		}
>  		contig_sz += next;
> -		bytes -= next;
> +		left -= next;
>  	}
> -	*num_pages = j - i;
>  
> +	*num_pages = i;
>  	return contig_sz;
>  }
>  
> @@ -1222,8 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
>  	struct page **pages = (struct page **)bv;
>  	ssize_t size;
> -	unsigned int num_pages, i = 0;
> -	size_t offset, folio_offset, left, len;
> +	unsigned int i = 0;
> +	size_t offset, left, len;
>  	int ret = 0;
>  
>  	/*
> @@ -1244,23 +1246,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  		return size ? size : -EFAULT;
>  
>  	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
> -	for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
> -		struct page *page = pages[i];
> -		struct folio *folio = page_folio(page);
> +	for (left = size; left > 0; left -= len) {
>  		unsigned int old_vcnt = bio->bi_vcnt;
> +		unsigned int nr_to_add;
>  
> -		folio_offset = ((size_t)folio_page_idx(folio, page) <<
> -			       PAGE_SHIFT) + offset;
> -
> -		len = min(folio_size(folio) - folio_offset, left);
> -
> -		num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
> -
> -		if (num_pages > 1)
> -			len = get_contig_folio_len(&num_pages, pages, i,
> -						   folio, left, offset);
> -
> -		if (!bio_add_folio(bio, folio, len, folio_offset)) {
> +		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
> +		if (!bio_add_page(bio, pages[i], len, offset)) {
>  			WARN_ON_ONCE(1);
>  			ret = -EINVAL;
>  			goto out;
> @@ -1275,8 +1266,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  			 * single pin per page.
>  			 */
>  			if (offset && bio->bi_vcnt == old_vcnt)
> -				unpin_user_folio(folio, 1);
> +				unpin_user_folio(page_folio(pages[i]), 1);
>  		}
> +		i += nr_to_add;
>  		offset = 0;
>  	}
>  


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
  2026-01-22 11:04     ` Johannes Thumshirn
  2026-01-22 17:59     ` Darrick J. Wong
@ 2026-01-23  8:35     ` Damien Le Moal
  2026-01-23 12:15     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:35 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> bio_add_page fails to add data to the bio when mixing P2P with non-P2P
> ranges, or ranges that map to different P2P providers.  In that case
> it will trigger that WARN_ON and return an error up the chain instead of
> simply starting a new bio as intended.  Fix this by open coding
> bio_add_page and handling this case explicitly.  While doing so, stop
> merging physical contiguous data that belongs to multiple folios.  While
> this merge could lead to more efficient bio packing in some cases,
> dropping it will allow removing the handling of this corner case in other
> places and make the code more robust.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-23  8:32     ` Damien Le Moal
@ 2026-01-23  8:35       ` Christoph Hellwig
  2026-01-23  8:44         ` Damien Le Moal
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23  8:35 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel

On Fri, Jan 23, 2026 at 07:32:04PM +1100, Damien Le Moal wrote:
> > -	unsigned int j;
> > +	struct folio *folio = page_folio(pages[0]);
> > +	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
> > +	unsigned int max_pages, i;
> > +	size_t folio_offset, len;
> > +
> > +	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
> 
> folio_page_idx(folio, pages[0]) is always going to be 0 here, no ?

No, page could be at an offset into the folio.


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 04/14] block: remove bio_release_page
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
  2026-01-22 11:14     ` Johannes Thumshirn
  2026-01-22 17:26     ` Darrick J. Wong
@ 2026-01-23  8:43     ` Damien Le Moal
  2026-01-23 12:17     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:43 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Merge bio_release_page into the only remaining caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-23  8:35       ` Christoph Hellwig
@ 2026-01-23  8:44         ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On 2026/01/23 19:35, Christoph Hellwig wrote:
> On Fri, Jan 23, 2026 at 07:32:04PM +1100, Damien Le Moal wrote:
>>> -	unsigned int j;
>>> +	struct folio *folio = page_folio(pages[0]);
>>> +	size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left);
>>> +	unsigned int max_pages, i;
>>> +	size_t folio_offset, len;
>>> +
>>> +	folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset;
>>
>> folio_page_idx(folio, pages[0]) is always going to be 0 here, no ?
> 
> No, page could be at an offset into the folio.

Arg... yes. pages[0] may not be the first page of the compound page...

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
                       ` (2 preceding siblings ...)
  2026-01-23  8:32     ` Damien Le Moal
@ 2026-01-23  8:45     ` Damien Le Moal
  2026-01-23 12:14     ` Anuj Gupta
  4 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:45 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Move all of the logic to find the contiguous length inside a folio into
> get_contig_folio_len instead of keeping some of it in the caller.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
  2026-01-22 13:05     ` Johannes Thumshirn
  2026-01-22 17:25     ` Darrick J. Wong
@ 2026-01-23  8:52     ` Damien Le Moal
  2026-01-23 12:20     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:52 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Add helpers to implement bounce buffering of data into a bio to implement
> direct I/O for cases where direct user access is not possible because
> stable in-flight data is required.  These are intended to be used as
> easily as bio_iov_iter_get_pages for the zero-copy path.
> 
> The write side is trivial and just copies data into the bounce buffer.
> The read side is a lot more complex because it needs to perform the copy
> from the completion context, and without preserving the iov_iter through
> the call chain.  It steals a trick from the integrity data user interface
> and uses the first vector in the bio for the bounce buffer data that is
> fed to the block I/O stack, and uses the others to record the user
> buffer fragments.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me, modulo one nit below.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


> +	/*
> +	 * Set the folio directly here.  The above loop has already calculated
> +	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
> +	 * is safe as bi_vcnt is only for user by the submitter and not looked

s/for user/for use

> +	 * at by the actual I/O path.
> +	 */
> +	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
> +	if (iov_iter_extract_will_pin(iter))
> +		bio_set_flag(bio, BIO_PAGE_PINNED);
> +	return 0;
> +}



-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 06/14] iomap: fix submission side handling of completion side errors
  2026-01-19  7:44   ` [PATCH 06/14] iomap: fix submission side handling of completion side errors Christoph Hellwig
  2026-01-19 17:40     ` Darrick J. Wong
@ 2026-01-23  8:54     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:54 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> The "if (dio->error)" in iomap_dio_bio_iter exists to stop submitting
> more bios when a completion already returned an error.  Commit cfe057f7db1f
> ("iomap_dio_actor(): fix iov_iter bugs") made it revert the iov by
> "copied", which is very wrong given that we've already consumed that
> range and submitted a bio for it.
> 
> Fixes: cfe057f7db1f ("iomap_dio_actor(): fix iov_iter bugs")
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 07/14] iomap: simplify iomap_dio_bio_iter
  2026-01-19  7:44   ` [PATCH 07/14] iomap: simplify iomap_dio_bio_iter Christoph Hellwig
  2026-01-19 17:43     ` Darrick J. Wong
@ 2026-01-23  8:55     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:55 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Use iov_iter_count to check if we need to continue as that just reads
> a field in the iov_iter, and only use bio_iov_vecs_to_alloc to calculate
> the actual number of vectors to allocate for the bio.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter
  2026-01-19  7:44   ` [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter Christoph Hellwig
@ 2026-01-23  8:57     ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:57 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Factor out a separate helper that builds and submits a single bio.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  2026-01-19  7:44   ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
@ 2026-01-23  8:58     ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:58 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Refactor the two per-bio completion handlers to share common code using
> a new helper.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 10/14] iomap: free the bio before completing the dio
  2026-01-19  7:44   ` [PATCH 10/14] iomap: free the bio before completing the dio Christoph Hellwig
  2026-01-19 17:43     ` Darrick J. Wong
@ 2026-01-23  8:59     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  8:59 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> There are good arguments for processing the user completions ASAP vs.
> freeing resources ASAP, but freeing the bio first here removes potential
> use after free hazards when checking flags, and will simplify the
> upcoming bounce buffer support.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  2026-01-19  7:44   ` [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED Christoph Hellwig
@ 2026-01-23  9:00     ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  9:00 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Match the more descriptive iov_iter terminology instead of encoding
> what we do with them for reads only.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 12/14] iomap: support ioends for direct reads
  2026-01-19  7:44   ` [PATCH 12/14] iomap: support ioends for direct reads Christoph Hellwig
@ 2026-01-23  9:02     ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  9:02 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Support using the ioend structure to defer I/O completion for direct
> reads in addition to writes.  This requires a check for the operation
> to not merge reads and writes in iomap_ioend_can_merge.  This support
> will be used for bounce buffered direct I/O reads that need to copy
> data back to the user address space on read completion.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O
  2026-01-19  7:44   ` [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O Christoph Hellwig
@ 2026-01-23  9:05     ` Damien Le Moal
  0 siblings, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  9:05 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Add a new flag that requests bounce buffering for direct I/O.  This is
> needed to provide the stable pages requirement requested by devices
> that need to calculate checksums or parity over the data and allows
> file systems to properly work with things like T10 protection
> information.  The implementation just calls out to the new bio bounce
> buffering helpers to allocate a bounce buffer, which is used for
> I/O and to copy to/from it.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages
  2026-01-19  7:44   ` [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages Christoph Hellwig
  2026-01-19 17:45     ` Darrick J. Wong
@ 2026-01-23  9:08     ` Damien Le Moal
  1 sibling, 0 replies; 75+ messages in thread
From: Damien Le Moal @ 2026-01-23  9:08 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Christian Brauner
  Cc: Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel

On 2026/01/19 18:44, Christoph Hellwig wrote:
> Fix direct I/O on devices that require stable pages by asking iomap
> to bounce buffer.  To support this, ioends are used for direct reads
> in this case to provide a user context for copying data back from the
> bounce buffer.
> 
> This fixes qemu when used on devices using T10 protection information
> and probably other cases like iSCSI using data digests.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks OK to me.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-19  7:44   ` [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Christoph Hellwig
  2026-01-22 17:47     ` Darrick J. Wong
@ 2026-01-23 11:37     ` David Howells
  2026-01-23 13:58       ` Christoph Hellwig
  1 sibling, 1 reply; 75+ messages in thread
From: David Howells @ 2026-01-23 11:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel

Christoph Hellwig <hch@lst.de> wrote:

> +static unsigned int get_contig_folio_len(struct page **pages,
> +		unsigned int *num_pages, size_t left, size_t offset)
> +{
> +	struct folio *folio = page_folio(pages[0]);

You can't do this.  You cannot assume that pages[0] is of folio type.
vmsplice() is unfortunately a thing and the page could be a network read
buffer.

David


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (13 preceding siblings ...)
  2026-01-19  7:44   ` [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages Christoph Hellwig
@ 2026-01-23 12:10   ` Anuj Gupta
  2026-01-23 14:01     ` Christoph Hellwig
  2026-01-23 14:09     ` Keith Busch
  2026-01-23 12:24   ` Christian Brauner
  15 siblings, 2 replies; 75+ messages in thread
From: Anuj Gupta @ 2026-01-23 12:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 2043 bytes --]

I ran experiments[1] on two devices - Samsung PM1733 and Intel Optane
with PI enabled (4K + 8b format). On my setup, I didn't observe any
noticeable difference for sequential write workloads. Sequential reads,
however, show a clear performance drop while using bounce buffering,
which is expected.
Used these fio commands listed below[2]

Feel free to add:
Tested-by: Anuj Gupta <anuj20.g@samsung.com>

[1]
Intel Optane:

Sequential write
   | size | zero copy  |  bounce    | 
   +------+------------+------------+
   |   4k | 158MiB/s   | 161MiB/s   |
   |  64K | 4522MiB/s  | 4506MiB/s  |
   |   1M | 4573MiB/s  | 4571MiB/s  |
   +------+-------------------------+

Sequential read
   | size | zero copy  |  bounce    | 
   +------+------------+------------+
   |   4k | 1693MiB/s  | 1245MiB/s  |
   |  64K | 6518MiB/s  | 4763MiB/s  |
   |   1M | 6731MiB/s  | 5475MiB/s  |
   +------+-------------------------+
   
   
For Samsung PM1733:

Sequential write
   | size | zero copy  |  bounce    | 
   +------+------------+------------+
   |   4k | 155MiB/s   | 153MiB/s   |
   |  64K | 3899MiB/s  | 3868MiB/s  |
   |   1M | 4117MiB/s  | 4116MiB/s  |
   +------+-------------------------+

Sequential read
   | size | zero copy  |  bounce    | 
   +------+------------+------------+
   |   4k | 602MiB/s   | 244MiB/s   |
   |  64K | 4613MiB/s  | 2141MiB/s  |
   |   1M | 5868MiB/s  | 5162MiB/s  |
   +------+-------------------------+
   

[2]
Write benchmark -
fio --name=write_new_4k --filename=/mnt/writefile --rw=write --bs=4k --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --time_based=1 --runtime=30 --group_reporting

Read benchmark -
Prepare the file:
fio --name=prep_create_prepfile --filename=/mnt/prepfile --rw=write --bs=1M --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --group_reporting

Then run the read workload:
fio --name=read_4k --filename=/mnt/prepfile --rw=read --bs=4k --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --time_based=1 --runtime=30 --group_reporting

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 01/14] block: refactor get_contig_folio_len
  2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
                       ` (3 preceding siblings ...)
  2026-01-23  8:45     ` Damien Le Moal
@ 2026-01-23 12:14     ` Anuj Gupta
  4 siblings, 0 replies; 75+ messages in thread
From: Anuj Gupta @ 2026-01-23 12:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 281 bytes --]

On 19/01/26 08:44AM, Christoph Hellwig wrote:
>Move all of the logic to find the contiguous length inside a folio into
>get_contig_folio_len instead of keeping some of it in the caller.
>
>Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
  2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
                       ` (2 preceding siblings ...)
  2026-01-23  8:35     ` Damien Le Moal
@ 2026-01-23 12:15     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Anuj Gupta @ 2026-01-23 12:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 743 bytes --]

On 19/01/26 08:44AM, Christoph Hellwig wrote:
>bio_add_page fails to add data to the bio when mixing P2P with non-P2P
>ranges, or ranges that map to different P2P providers.  In that case
>it will trigger that WARN_ON and return an error up the chain instead of
>simply starting a new bio as intended.  Fix this by open coding
>bio_add_page and handling this case explicitly.  While doing so, stop
>merging physical contiguous data that belongs to multiple folios.  While
>this merge could lead to more efficient bio packing in some case,
>dropping will allow to remove handling of this corner case in other
>places and make the code more robust.
>
>Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 04/14] block: remove bio_release_page
  2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
                       ` (2 preceding siblings ...)
  2026-01-23  8:43     ` Damien Le Moal
@ 2026-01-23 12:17     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Anuj Gupta @ 2026-01-23 12:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 198 bytes --]

On 19/01/26 08:44AM, Christoph Hellwig wrote:
>Merge bio_release_page into the only remaining caller.
>
>Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios
  2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
                       ` (2 preceding siblings ...)
  2026-01-23  8:52     ` Damien Le Moal
@ 2026-01-23 12:20     ` Anuj Gupta
  3 siblings, 0 replies; 75+ messages in thread
From: Anuj Gupta @ 2026-01-23 12:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

[-- Attachment #1: Type: text/plain, Size: 873 bytes --]

On 19/01/26 08:44AM, Christoph Hellwig wrote:
>Add helpers to implement bounce buffering of data into a bio to implement
>direct I/O for cases where direct user access is not possible because
>stable in-flight data is required.  These are intended to be used as
>easily as bio_iov_iter_get_pages for the zero-copy path.
>
>The write side is trivial and just copies data into the bounce buffer.
>The read side is a lot more complex because it needs to perform the copy
>from the completion context, and without preserving the iov_iter through
>the call chain.  It steals a trick from the integrity data user interface
>and uses the first vector in the bio for the bounce buffer data that is
>fed to the block I/O stack, and uses the others to record the user
>buffer fragments.
>
>Signed-off-by: Christoph Hellwig <hch@lst.de>

Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
                     ` (14 preceding siblings ...)
  2026-01-23 12:10   ` bounce buffer direct I/O when stable pages are required v2 Anuj Gupta
@ 2026-01-23 12:24   ` Christian Brauner
  2026-01-23 14:10     ` block or iomap tree, was: " Christoph Hellwig
  15 siblings, 1 reply; 75+ messages in thread
From: Christian Brauner @ 2026-01-23 12:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Christian Brauner, Darrick J. Wong, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel, Jens Axboe

On Mon, 19 Jan 2026 08:44:07 +0100, Christoph Hellwig wrote:
> this series tries to address the problem that under I/O pages can be
> modified during direct I/O, even when the device or file system require
> stable pages during I/O to calculate checksums, parity or data
> operations.  It does so by adding block layer helpers to bounce buffer
> an iov_iter into a bio, then wires that up in iomap and ultimately
> XFS.
> 
> [...]

Applied to the vfs-7.0.iomap branch of the vfs/vfs.git tree.
Patches in the vfs-7.0.iomap branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.0.iomap

[01/14] block: refactor get_contig_folio_len
        https://git.kernel.org/vfs/vfs/c/81b30a454966
[02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges
        https://git.kernel.org/vfs/vfs/c/447ca020a401
[03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
        https://git.kernel.org/vfs/vfs/c/1f0b577cd289
[04/14] block: remove bio_release_page
        https://git.kernel.org/vfs/vfs/c/8422e6bde5c1
[05/14] block: add helpers to bounce buffer an iov_iter into bios
        https://git.kernel.org/vfs/vfs/c/cec1a583be7b
[06/14] iomap: fix submission side handling of completion side errors
        https://git.kernel.org/vfs/vfs/c/006966526be1
[07/14] iomap: simplify iomap_dio_bio_iter
        https://git.kernel.org/vfs/vfs/c/87226227f1bc
[08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter
        https://git.kernel.org/vfs/vfs/c/d9e65abb3c1b
[09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
        https://git.kernel.org/vfs/vfs/c/eb1620aac3ed
[10/14] iomap: free the bio before completing the dio
        https://git.kernel.org/vfs/vfs/c/dd6c37c1e1bf
[11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
        https://git.kernel.org/vfs/vfs/c/ee377c08560c
[12/14] iomap: support ioends for direct reads
        https://git.kernel.org/vfs/vfs/c/3bcca2b5d53b
[13/14] iomap: add a flag to bounce buffer direct I/O
        https://git.kernel.org/vfs/vfs/c/dcc3a3452079
[14/14] xfs: use bounce buffering direct I/O when the device requires stable pages
        https://git.kernel.org/vfs/vfs/c/387bea142297

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23 11:37     ` David Howells
@ 2026-01-23 13:58       ` Christoph Hellwig
  2026-01-23 14:57         ` David Howells
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23 13:58 UTC (permalink / raw)
  To: David Howells
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel, Kundan Kumar, Matthew Wilcox

On Fri, Jan 23, 2026 at 11:37:17AM +0000, David Howells wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > +static unsigned int get_contig_folio_len(struct page **pages,
> > +		unsigned int *num_pages, size_t left, size_t offset)
> > +{
> > +	struct folio *folio = page_folio(pages[0]);
> 
> You can't do this.  You cannot assume that pages[0] is of folio type.
> vmsplice() is unfortunately a thing and the page could be a network read
> buffer.

Hmm, this just moves around existing code added in commit ed9832bc08db
("block: introduce folio awareness and add a bigger size from folio").

How do we get these network read buffers into either a user address
space or a (non-bvec) iter passed to O_DIRECT reads/writes?

Can we come up with a testcase for xfstests or blktests for this?

How do we find out if a given page is a folio and that we can do this?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-23 12:10   ` bounce buffer direct I/O when stable pages are required v2 Anuj Gupta
@ 2026-01-23 14:01     ` Christoph Hellwig
  2026-01-23 14:09     ` Keith Busch
  1 sibling, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23 14:01 UTC (permalink / raw)
  To: Anuj Gupta
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel

On Fri, Jan 23, 2026 at 05:40:26PM +0530, Anuj Gupta wrote:
> I ran experiments[1] on two devices - Samsung PM1733 and Intel Optane
> with PI enabled (4K + 8b format).

Thanks for the additional benchmarking on more beefy devices!

> On my setup, I didn't observe any
> noticeable difference for sequential write workloads. Sequential reads,
> however, show a clear performance drop while using bounce buffering,
> which is expected.
> Used these fio commands listed below[2]
>
> Feel free to add:
> Tested-by: Anuj Gupta <anuj20.g@samsung.com>


^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-23 12:10   ` bounce buffer direct I/O when stable pages are required v2 Anuj Gupta
  2026-01-23 14:01     ` Christoph Hellwig
@ 2026-01-23 14:09     ` Keith Busch
  1 sibling, 0 replies; 75+ messages in thread
From: Keith Busch @ 2026-01-23 14:09 UTC (permalink / raw)
  To: Anuj Gupta, t
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel

On Fri, Jan 23, 2026 at 05:40:26PM +0530, Anuj Gupta wrote:
> Sequential read
>   | size | zero copy  |  bounce    |
>   +------+------------+------------+
>   |   4k | 1693MiB/s  | 1245MiB/s  |
>   |  64K | 6518MiB/s  | 4763MiB/s  |
>   |   1M | 6731MiB/s  | 5475MiB/s  |
>   +------+-------------------------+
> For Samsung PM1733:
> 
> Sequential write
>   | size | zero copy  |  bounce    |
>   +------+------------+------------+
>   |   4k | 155MiB/s   | 153MiB/s   |
>   |  64K | 3899MiB/s  | 3868MiB/s  |
>   |   1M | 4117MiB/s  | 4116MiB/s  |
>   +------+-------------------------+
> 
> Sequential read
>   | size | zero copy  |  bounce    |
>   +------+------------+------------+
>   |   4k | 602MiB/s   | 244MiB/s   |
>   |  64K | 4613MiB/s  | 2141MiB/s  |
>   |   1M | 5868MiB/s  | 5162MiB/s  |
>   +------+-------------------------+
> 
> [2]
> Write benchmark -
> fio --name=write_new_4k --filename=/mnt/writefile --rw=write --bs=4k --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --time_based=1 --runtime=30 --group_reporting
> 
> Read benchmark -
> Prepare the file:
> fio --name=prep_create_prepfile --filename=/mnt/prepfile --rw=write --bs=1M --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --group_reporting
> 
> Then run the read workload:
> fio --name=read_4k --filename=/mnt/prepfile --rw=read --bs=4k --size=20G --ioengine=io_uring --direct=1 --iodepth=16 --numjobs=1 --time_based=1 --runtime=30 --group_reporting

For this change, I think it'd be more meaningful to report latency
rather than throughput. Can you try a QD1 workload instead?

^ permalink raw reply	[flat|nested] 75+ messages in thread

* block or iomap tree, was: Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-23 12:24   ` Christian Brauner
@ 2026-01-23 14:10     ` Christoph Hellwig
  2026-01-27 10:31       ` Christian Brauner
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-23 14:10 UTC (permalink / raw)
  To: Christian Brauner, Jens Axboe
  Cc: Christoph Hellwig, Darrick J. Wong, Carlos Maiolino, Qu Wenruo,
	Al Viro, linux-block, linux-xfs, linux-fsdevel

On Fri, Jan 23, 2026 at 01:24:08PM +0100, Christian Brauner wrote:
> Applied to the vfs-7.0.iomap branch of the vfs/vfs.git tree.
> Patches in the vfs-7.0.iomap branch should appear in linux-next soon.

Hmm, I have another minor revision in the making.  This is mostly
spelling fixes, removing a duplicate page_folio call, adding a new
comment and adding symbolic constants for the max bvec_iter/bio sizes.

I also have some other work that would conflict with this in the block
layer.

What do you and Jens think of waiting for another quick respin and
merging it through the block tree or a shared branch in the block
tree?  There really is nothing in the iomap branch that conflicts,
and nothing really coming up that I can think of.

My current state as of this morning is here:

https://git.infradead.org/?p=users/hch/misc.git;a=shortlog;h=refs/heads/iomap-bounce

I think there's another spelling fix pending from the more recent
reviews since then.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23 13:58       ` Christoph Hellwig
@ 2026-01-23 14:57         ` David Howells
  2026-01-26 17:36           ` Matthew Wilcox
                             ` (2 more replies)
  0 siblings, 3 replies; 75+ messages in thread
From: David Howells @ 2026-01-23 14:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: dhowells, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel, Kundan Kumar, Matthew Wilcox

Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Jan 23, 2026 at 11:37:17AM +0000, David Howells wrote:
> > Christoph Hellwig <hch@lst.de> wrote:
> > 
> > > +static unsigned int get_contig_folio_len(struct page **pages,
> > > +		unsigned int *num_pages, size_t left, size_t offset)
> > > +{
> > > +	struct folio *folio = page_folio(pages[0]);
> > 
> > You can't do this.  You cannot assume that pages[0] is of folio type.
> > vmsplice() is unfortunately a thing and the page could be a network read
> > buffer.
> 
> Hmm, this just moves around existing code added in commit ed9832bc08db
> ("block: introduce folio awareness and add a bigger size from folio").
> 
> How do we get these network read buffers into either a user address
> space or a (non-bvec) iter passed to O_DIRECT reads/writes?

Splice from TCP socket to pipe, vmsplice from there into process address
space; DIO write() from there I think should do it.

What you might need to do is write page-sized chunks into one end of the TCP
socket and flush it after each one so that vmsplice() sees page-sized chunks
of data.  I'm not sure how well an external connection would work to get
actual transmission buffers.  The problem is that the received packet is
page-aligned, including the network headers (I think), so if you can, say,
send 8K packets, you'd have to try and guess where the page boundaries are as
vmsplice can only work on whole pages.

Can we make vmsplice() just copy data?

> Can we come up with testcase for xfstests or blktests for this?
> 
> How do we find out if a given page is a folio and that we can do this?

That's a question for Willy.

David

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23 14:57         ` David Howells
@ 2026-01-26 17:36           ` Matthew Wilcox
  2026-01-27  5:13             ` Christoph Hellwig
  2026-02-03  8:20           ` Askar Safin
  2026-02-03 10:28           ` Askar Safin
  2 siblings, 1 reply; 75+ messages in thread
From: Matthew Wilcox @ 2026-01-26 17:36 UTC (permalink / raw)
  To: David Howells
  Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel, Kundan Kumar

On Fri, Jan 23, 2026 at 02:57:06PM +0000, David Howells wrote:
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Jan 23, 2026 at 11:37:17AM +0000, David Howells wrote:
> > > Christoph Hellwig <hch@lst.de> wrote:
> > > 
> > > > +static unsigned int get_contig_folio_len(struct page **pages,
> > > > +		unsigned int *num_pages, size_t left, size_t offset)
> > > > +{
> > > > +	struct folio *folio = page_folio(pages[0]);
> > > 
> > > You can't do this.  You cannot assume that pages[0] is of folio type.
> > > vmsplice() is unfortunately a thing and the page could be a network read
> > > buffer.
> > 
> > Hmm, this just moves around existing code added in commit ed9832bc08db
> > ("block: introduce folio awareness and add a bigger size from folio").
> > 
> > How do we get these network read buffers into either a user address
> > space or a (non-bvec) iter passed to O_DIRECT reads/writes?
> 
> Splice from TCP socket to pipe, vmsplice from there into process address
> space; DIO write() from there I think should do it.

Some other ways to get something that isn't a folio mapped into a user
address space:

 - mmap() a vmalloc-allocated buffer.  We don't have a good story here
   yet; we could declare every driver that does this to be buggy and
   force them to allocate folios and vmap them.  Seems a bit
   unreasonable and likely to end up with a lot of duplicate code with
   bugs.  I've prototyped another approach, but it's not ready to share
   yet.
 - mmap() the perf ring buffer.  We could decide to refuse to do DIO to
   this buffer.

> > How do we find out if a given page is a folio and that we can do this?
> 
> That's a question for Willy.

Today there's no way.  Although you could test for page_has_type()?

The eventual solution is that page_folio() will return NULL for pages
which do not belong to folios.  That's independent of whether we decide
to make user-mappable-vmalloc contain folios, or whether we have some
other way to map/track pages-that-belong-to-vmalloc.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-26 17:36           ` Matthew Wilcox
@ 2026-01-27  5:13             ` Christoph Hellwig
  2026-01-27  5:44               ` Matthew Wilcox
  0 siblings, 1 reply; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-27  5:13 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: David Howells, Christoph Hellwig, Jens Axboe, Christian Brauner,
	Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel, Kundan Kumar

On Mon, Jan 26, 2026 at 05:36:12PM +0000, Matthew Wilcox wrote:
> > > Hmm, this just moves around existing code added in commit ed9832bc08db
> > > ("block: introduce folio awareness and add a bigger size from folio").
> > > 
> > > How do we get these network read buffers into either a user address
> > > space or a (non-bvec) iter passed to O_DIRECT reads/writes?
> > 
> > Splice from TCP socket to pipe, vmsplice from there into process address
> > space; DIO write() from there I think should do it.
> 
> Some other ways to get something that isn't a folio mapped into a user
> address space:
> 
>  - mmap() a vmalloc-allocated buffer.  We don't have a good story here
>    yet; we could declare every driver that does this to be buggy and
>    force them to allocate folios and vmap them.  Seems a bit
>    unreasonable and likely to end up with a lot of duplicate code with
>    bugs.  I've prototyped another approach, but it's not ready to share
>    yet.
>  - mmap() the perf ring buffer.  We could decide to refuse to do DIO to
>    this buffer.

I'm confused.  Your examples are all about something that would happen if
we actually split up what is currently struct page in some way.  But I
read Dave's mail as something is broken right now already.  Which of
those is the case?

> The eventual solution is that page_folio() will return NULL for pages
> which do not belong to folios.  That's independent of whether we decide
> to make user-mappable-vmalloc contain folios, or whether we have some
> other way to map/track pages-that-belong-to-vmalloc.

vmalloc is a tiny wrapper around alloc_page* + vmap/vm_map_area, and a
lot of code all over the kernel relies on that.  Trying to have a
separate "memory type" for vmalloc is going to break things left right
and center for not much obvious gain.  I'm not going to say you can't
do it, but I doubt that it actually ends up easy and particularly
useful.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-27  5:13             ` Christoph Hellwig
@ 2026-01-27  5:44               ` Matthew Wilcox
  2026-01-27  5:47                 ` Christoph Hellwig
  0 siblings, 1 reply; 75+ messages in thread
From: Matthew Wilcox @ 2026-01-27  5:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: David Howells, Jens Axboe, Christian Brauner, Darrick J. Wong,
	Carlos Maiolino, Qu Wenruo, Al Viro, linux-block, linux-xfs,
	linux-fsdevel, Kundan Kumar

On Tue, Jan 27, 2026 at 06:13:52AM +0100, Christoph Hellwig wrote:
> On Mon, Jan 26, 2026 at 05:36:12PM +0000, Matthew Wilcox wrote:
> > > > Hmm, this just moves around existing code added in commit ed9832bc08db
> > > > ("block: introduce folio awareness and add a bigger size from folio").
> > > > 
> > > > How do we get these network read buffers into either a user address
> > > > space or a (non-bvec) iter passed to O_DIRECT reads/writes?
> > > 
> > > Splice from TCP socket to pipe, vmsplice from there into process address
> > > space; DIO write() from there I think should do it.
> > 
> > Some other ways to get something that isn't a folio mapped into a user
> > address space:
> > 
> >  - mmap() a vmalloc-allocated buffer.  We don't have a good story here
> >    yet; we could declare every driver that does this to be buggy and
> >    force them to allocate folios and vmap them.  Seems a bit
> >    unreasonable and likely to end up with a lot of duplicate code with
> >    bugs.  I've prototyped another approach, but it's not ready to share
> >    yet.
> >  - mmap() the perf ring buffer.  We could decide to refuse to do DIO to
> >    this buffer.
> 
> I'm confused.  Your example are all about something that would happen if
> we actually split up what is currently struct page in some way.  But I
> read Dave's mail as something is broken right now already.  Which of
> those is the case?

What's broken right now is that the network buffers are now using frozen
pages, so they have a zero refcount (Dave, do I remember the current
state of play correctly?)

> > The eventual solution is that page_folio() will return NULL for pages
> > which do not belong to folios.  That's independent of whether we decide
> > to make user-mappable-vmalloc contain folios, or whether we have some
> > other way to map/track pages-that-belong-to-vmalloc.
> 
> vmalloc is a tiny wrapper around alloc_page* + vmap/vm_map_area, and a
> lot of code all over the kernel relies on that.  Trying to have a
> separate "memory type" for vmalloc is going to break things left right
> and center for not much obvious gain.  I'm not going to say you can't
> do it, but I doubt that is actually ends up easy and particularly
> useful.

Most of the code in the kernel doesn't drill down from vmalloc to page.
I don't think it's going to be all that painful, but I also don't think
I'll need to address vmalloc in the first half of this year.  Just trying
to fill you in on the current plans.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-27  5:44               ` Matthew Wilcox
@ 2026-01-27  5:47                 ` Christoph Hellwig
  0 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-27  5:47 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Hellwig, David Howells, Jens Axboe, Christian Brauner,
	Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro, linux-block,
	linux-xfs, linux-fsdevel, Kundan Kumar

On Tue, Jan 27, 2026 at 05:44:01AM +0000, Matthew Wilcox wrote:
> > I'm confused.  Your example are all about something that would happen if
> > we actually split up what is currently struct page in some way.  But I
> > read Dave's mail as something is broken right now already.  Which of
> > those is the case?
> 
> What's broken right now is that the network buffers are now using frozen
> pages, so they have a zero refcount (Dave, do I remember the current
> state of play correctly?)

Nothing using this function right now ever deals with the page refcounts,
so that should not be an issue.

> > vmalloc is a tiny wrapper around alloc_page* + vmap/vm_map_area, and a
> > lot of code all over the kernel relies on that.  Trying to have a
> > separate "memory type" for vmalloc is going to break things left right
> > and center for not much obvious gain.  I'm not going to say you can't
> > do it, but I doubt that is actually ends up easy and particularly
> > useful.
> 
> Most of the code in the kernel doesn't drill down from vmalloc to page.
> I don't think it's going to be all that painful, but I also don't think
> I'll need to address vmalloc in the first half of this year.  Just trying
> to fill you in on the current plans.

Maybe not most of the kernel, but vmalloc_to_page and is_vmalloc_addr
are used in quite a lot of places, and usually need to handle both
actual vmalloc allocations, and page/folio allocations mapped into
vmalloc space.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: block or iomap tree, was: Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-23 14:10     ` block or iomap tree, was: " Christoph Hellwig
@ 2026-01-27 10:31       ` Christian Brauner
  2026-01-27 12:50         ` Christoph Hellwig
  0 siblings, 1 reply; 75+ messages in thread
From: Christian Brauner @ 2026-01-27 10:31 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Darrick J. Wong, Carlos Maiolino, Qu Wenruo, Al Viro,
	linux-block, linux-xfs, linux-fsdevel

On Fri, Jan 23, 2026 at 03:10:32PM +0100, Christoph Hellwig wrote:
> On Fri, Jan 23, 2026 at 01:24:08PM +0100, Christian Brauner wrote:
> > Applied to the vfs-7.0.iomap branch of the vfs/vfs.git tree.
> > Patches in the vfs-7.0.iomap branch should appear in linux-next soon.
> 
> Hmm, I have another minor revision in the making.  This is mostly
> spelling fixes, removing a duplicate page_folio call, adding a new
> comment and adding symbolic constants for the max bvec_iter/bio sizes.
> 
> I also have some other work that would conflict with this in the block
> layer.
> 
> What do you and Jens think of waiting for another quick respin and
> merging it through the block tree or a shared branch in the block
> tree?  There really is nothing in the iomap branch that conflicts,

I don't mind per se. I haven't pushed this into -next yet. We can also
just wait for the next merge window given how close we're cutting it.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: block or iomap tree, was: Re: bounce buffer direct I/O when stable pages are required v2
  2026-01-27 10:31       ` Christian Brauner
@ 2026-01-27 12:50         ` Christoph Hellwig
  0 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-01-27 12:50 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Christoph Hellwig, Jens Axboe, Darrick J. Wong, Carlos Maiolino,
	Qu Wenruo, Al Viro, linux-block, linux-xfs, linux-fsdevel

On Tue, Jan 27, 2026 at 11:31:30AM +0100, Christian Brauner wrote:
> > What do you and Jens think of waiting for another quick respin and
> > merging it through the block tree or a shared branch in the block
> > tree?  There really is nothing in the iomap branch that conflicts,
> 
> I don't mind per se. I haven't pushed this into -next yet. We can also
> just wait for the next merge window given how close we're cutting it.

I'd really like to see this merge window at least for this series,
so that we can finally use qemu out of the box on PI devices.

I was hoping to also get the file system PI series in, but I think
we're getting too close for it, and with that the tree might not
matter any more for this one.  It would just be nice to get the
repost, and maintainer ACKs for both block and iomap.  With you
ready to take it and Darrick's review I think that's a yes for
iomap, but I'd really like to hear from Jens.

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23 14:57         ` David Howells
  2026-01-26 17:36           ` Matthew Wilcox
@ 2026-02-03  8:20           ` Askar Safin
  2026-02-03 10:28           ` Askar Safin
  2 siblings, 0 replies; 75+ messages in thread
From: Askar Safin @ 2026-02-03  8:20 UTC (permalink / raw)
  To: dhowells
  Cc: axboe, brauner, cem, djwong, hch, kundan.kumar, linux-block,
	linux-fsdevel, linux-xfs, viro, willy, wqu

David Howells <dhowells@redhat.com>:
> vmsplice from there into process address
> space

You mean vmsplice from pipe? According to this comment
(but I didn't read actual code) vmsplice *from* pipe
(as opposed to vmsplice *to* pipe) is equivalent to
normal readv:

https://elixir.bootlin.com/linux/v6.19-rc5/source/fs/splice.c#L1500


-- 
Askar Safin

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-01-23 14:57         ` David Howells
  2026-01-26 17:36           ` Matthew Wilcox
  2026-02-03  8:20           ` Askar Safin
@ 2026-02-03 10:28           ` Askar Safin
  2026-02-03 16:32             ` Christoph Hellwig
  2 siblings, 1 reply; 75+ messages in thread
From: Askar Safin @ 2026-02-03 10:28 UTC (permalink / raw)
  To: dhowells
  Cc: axboe, brauner, cem, djwong, hch, kundan.kumar, linux-block,
	linux-fsdevel, linux-xfs, viro, willy, wqu

David Howells <dhowells@redhat.com>:
> Can we make vmsplice() just copy data?

vmsplice already caused at least one security issue in the past:
CVE-2020-29374 (see https://lwn.net/Articles/849638/ ). There may be other
CVEs, try to search CVE database.

Also, I think vmsplice is rarely used.

So, if you author a patch which makes vmsplice equivalent to readv/writev,
and mention these CVEs, then, I think, such a patch has a high chance of
succeeding.

Also, as far as I understand, this patch introduces kbufs,
which are a modern uring-based alternative to whatever splice/pipe was originally
meant to be:
https://lore.kernel.org/all/20260116233044.1532965-4-joannelkoong@gmail.com/ .

I. e. these kbufs provide kernel-managed buffer for fast I/O.

So, I think it is a good idea to deprecate splice in favor of these kbufs.

-- 
Askar Safin

^ permalink raw reply	[flat|nested] 75+ messages in thread

* Re: [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  2026-02-03 10:28           ` Askar Safin
@ 2026-02-03 16:32             ` Christoph Hellwig
  0 siblings, 0 replies; 75+ messages in thread
From: Christoph Hellwig @ 2026-02-03 16:32 UTC (permalink / raw)
  To: Askar Safin
  Cc: dhowells, axboe, brauner, cem, djwong, hch, kundan.kumar,
	linux-block, linux-fsdevel, linux-xfs, viro, willy, wqu

On Tue, Feb 03, 2026 at 01:28:21PM +0300, Askar Safin wrote:
> David Howells <dhowells@redhat.com>:
> > Can we make vmsplice() just copy data?
> 
> vmsplice already caused at least one security issue in the past:
> CVE-2020-29374 (see https://lwn.net/Articles/849638/ ). There may be other
> CVEs, try to search CVE database.
> 
> Also, I think vmsplice is rarely used.
> 
> So, if you author a patch, which makes vmsplice equivalent to readv/writev,
> and mention these CVEs, then, I think, such patch has high chance to
> succeed.

I'd be all for killing it, especially as getting it to properly pin
the user buffers is still unsolved so far.


^ permalink raw reply	[flat|nested] 75+ messages in thread

end of thread, other threads:[~2026-02-03 16:32 UTC | newest]

Thread overview: 75+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <CGME20260123121444epcas5p4e729259011e031a28be8379ea3b9b749@epcas5p4.samsung.com>
2026-01-19  7:44 ` bounce buffer direct I/O when stable pages are required v2 Christoph Hellwig
2026-01-19  7:44   ` [PATCH 01/14] block: refactor get_contig_folio_len Christoph Hellwig
2026-01-22 11:00     ` Johannes Thumshirn
2026-01-22 17:54     ` Darrick J. Wong
2026-01-23  8:32     ` Damien Le Moal
2026-01-23  8:35       ` Christoph Hellwig
2026-01-23  8:44         ` Damien Le Moal
2026-01-23  8:45     ` Damien Le Moal
2026-01-23 12:14     ` Anuj Gupta
2026-01-19  7:44   ` [PATCH 02/14] block: open code bio_add_page and fix handling of mismatching P2P ranges Christoph Hellwig
2026-01-22 11:04     ` Johannes Thumshirn
2026-01-22 17:59     ` Darrick J. Wong
2026-01-23  5:43       ` Christoph Hellwig
2026-01-23  7:05         ` Darrick J. Wong
2026-01-23  8:35     ` Damien Le Moal
2026-01-23 12:15     ` Anuj Gupta
2026-01-19  7:44   ` [PATCH 03/14] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Christoph Hellwig
2026-01-22 17:47     ` Darrick J. Wong
2026-01-23  5:44       ` Christoph Hellwig
2026-01-23  7:09         ` Darrick J. Wong
2026-01-23  7:14           ` Christoph Hellwig
2026-01-23 11:37     ` David Howells
2026-01-23 13:58       ` Christoph Hellwig
2026-01-23 14:57         ` David Howells
2026-01-26 17:36           ` Matthew Wilcox
2026-01-27  5:13             ` Christoph Hellwig
2026-01-27  5:44               ` Matthew Wilcox
2026-01-27  5:47                 ` Christoph Hellwig
2026-02-03  8:20           ` Askar Safin
2026-02-03 10:28           ` Askar Safin
2026-02-03 16:32             ` Christoph Hellwig
2026-01-19  7:44   ` [PATCH 04/14] block: remove bio_release_page Christoph Hellwig
2026-01-22 11:14     ` Johannes Thumshirn
2026-01-22 17:26     ` Darrick J. Wong
2026-01-23  8:43     ` Damien Le Moal
2026-01-23 12:17     ` Anuj Gupta
2026-01-19  7:44   ` [PATCH 05/14] block: add helpers to bounce buffer an iov_iter into bios Christoph Hellwig
2026-01-22 13:05     ` Johannes Thumshirn
2026-01-22 17:25     ` Darrick J. Wong
2026-01-23  5:51       ` Christoph Hellwig
2026-01-23  7:11         ` Darrick J. Wong
2026-01-23  7:16           ` Christoph Hellwig
2026-01-23  8:52     ` Damien Le Moal
2026-01-23 12:20     ` Anuj Gupta
2026-01-19  7:44   ` [PATCH 06/14] iomap: fix submission side handling of completion side errors Christoph Hellwig
2026-01-19 17:40     ` Darrick J. Wong
2026-01-23  8:54     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 07/14] iomap: simplify iomap_dio_bio_iter Christoph Hellwig
2026-01-19 17:43     ` Darrick J. Wong
2026-01-23  8:55     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 08/14] iomap: split out the per-bio logic from iomap_dio_bio_iter Christoph Hellwig
2026-01-23  8:57     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
2026-01-23  8:58     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 10/14] iomap: free the bio before completing the dio Christoph Hellwig
2026-01-19 17:43     ` Darrick J. Wong
2026-01-23  8:59     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 11/14] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED Christoph Hellwig
2026-01-23  9:00     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 12/14] iomap: support ioends for direct reads Christoph Hellwig
2026-01-23  9:02     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 13/14] iomap: add a flag to bounce buffer direct I/O Christoph Hellwig
2026-01-23  9:05     ` Damien Le Moal
2026-01-19  7:44   ` [PATCH 14/14] xfs: use bounce buffering direct I/O when the device requires stable pages Christoph Hellwig
2026-01-19 17:45     ` Darrick J. Wong
2026-01-23  9:08     ` Damien Le Moal
2026-01-23 12:10   ` bounce buffer direct I/O when stable pages are required v2 Anuj Gupta
2026-01-23 14:01     ` Christoph Hellwig
2026-01-23 14:09     ` Keith Busch
2026-01-23 12:24   ` Christian Brauner
2026-01-23 14:10     ` block or iomap tree, was: " Christoph Hellwig
2026-01-27 10:31       ` Christian Brauner
2026-01-27 12:50         ` Christoph Hellwig
2026-01-14  7:40 bounce buffer direct I/O when stable pages are required Christoph Hellwig
2026-01-14  7:41 ` [PATCH 09/14] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Christoph Hellwig
2026-01-14 22:54   ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox