[PATCH 1/2] block: accumulate segment page gaps per bio

linux-nvme.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/2] block: accumulate segment page gaps per bio
@ 2025-08-05 19:56 Keith Busch
  2025-08-05 19:56 ` [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices Keith Busch
                   ` (3 more replies)
  0 siblings, 4 replies; 15+ messages in thread
From: Keith Busch @ 2025-08-05 19:56 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The blk-mq dma iteration has an optimization for requests that align to
the device's iommu merge boundary. This boundary may be larger than the
device's virtual boundary, but the code had been depending on that queue
limit to know ahead of time if the request aligns to the optimization.

Rather than rely on that queue limit, which many devices may not even
have, store the virtual boundary gaps of each segment into the bio as a
mask while checking the segments and merging. We can then quickly check
per io if the request can use the optimization or not.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-merge.c         | 30 +++++++++++++++++++++++++++---
 block/blk-mq-dma.c        |  3 +--
 block/blk-mq.c            |  5 +++++
 include/linux/blk-mq.h    |  6 ++++++
 include/linux/blk_types.h |  2 ++
 5 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 81bdad915699a..d63389c063006 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -278,6 +278,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
 	return lim->logical_block_size;
 }
 
+#define bv_seg_gap(bv, bvprv) \
+	bv.bv_offset | ((bvprv.bv_offset + bvprv.bv_len) & (PAGE_SIZE - 1));
+
 /**
  * bio_split_rw_at - check if and where to split a read/write bio
  * @bio:  [in] bio to be split
@@ -293,9 +296,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
 int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, unsigned max_bytes)
 {
+	unsigned nsegs = 0, bytes = 0, page_gaps = 0;
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, bytes = 0;
 
 	bio_for_each_bvec(bv, bio, iter) {
 		if (bv.bv_offset & lim->dma_alignment)
@@ -305,8 +308,11 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
-			goto split;
+		if (bvprvp) {
+			if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
+				goto split;
+			page_gaps |= bv_seg_gap(bv, bvprv);
+		}
 
 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
@@ -324,6 +330,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
 	}
 
 	*segs = nsegs;
+	bio->page_gaps = page_gaps;
 	return 0;
 split:
 	if (bio->bi_opf & REQ_ATOMIC)
@@ -353,6 +360,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
 	 * big IO can be trival, disable iopoll when split needed.
 	 */
 	bio_clear_polled(bio);
+	bio->page_gaps = page_gaps;
 	return bytes >> SECTOR_SHIFT;
 }
 EXPORT_SYMBOL_GPL(bio_split_rw_at);
@@ -696,6 +704,8 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
 static struct request *attempt_merge(struct request_queue *q,
 				     struct request *req, struct request *next)
 {
+	struct bio_vec bv, bvprv;
+
 	if (!rq_mergeable(req) || !rq_mergeable(next))
 		return NULL;
 
@@ -753,6 +763,10 @@ static struct request *attempt_merge(struct request_queue *q,
 	if (next->start_time_ns < req->start_time_ns)
 		req->start_time_ns = next->start_time_ns;
 
+	bv = next->bio->bi_io_vec[0];
+	bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
+	req->__page_gaps |= blk_rq_page_gaps(next) | bv_seg_gap(bv, bvprv);
+
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
@@ -861,6 +875,7 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
 		struct bio *bio, unsigned int nr_segs)
 {
 	const blk_opf_t ff = bio_failfast(bio);
+	struct bio_vec bv, bvprv;
 
 	if (!ll_back_merge_fn(req, bio, nr_segs))
 		return BIO_MERGE_FAILED;
@@ -876,6 +891,10 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
 	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
 		blk_zone_write_plug_bio_merged(bio);
 
+	bv = bio->bi_io_vec[0];
+	bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
+	req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
+
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
@@ -890,6 +909,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 		struct bio *bio, unsigned int nr_segs)
 {
 	const blk_opf_t ff = bio_failfast(bio);
+	struct bio_vec bv, bvprv;
 
 	/*
 	 * A front merge for writes to sequential zones of a zoned block device
@@ -910,6 +930,10 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 
 	blk_update_mixed_merge(req, bio, true);
 
+	bv = req->bio->bi_io_vec[0];
+	bvprv = bio->bi_io_vec[bio->bi_vcnt - 1];
+	req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
+
 	bio->bi_next = req->bio;
 	req->bio = bio;
 
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index faa36ff6465ee..a03067c4a268f 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -73,8 +73,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 static inline bool blk_can_dma_map_iova(struct request *req,
 		struct device *dma_dev)
 {
-	return !((queue_virt_boundary(req->q) + 1) &
-		dma_get_merge_boundary(dma_dev));
+	return !(blk_rq_page_gaps(req) & dma_get_merge_boundary(dma_dev));
 }
 
 static bool blk_dma_map_bus(struct blk_dma_iter *iter)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b67d6c02ecebd..09134a66c5666 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->queuelist);
 	rq->q = q;
 	rq->__sector = (sector_t) -1;
+	rq->__page_gaps = 0;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = BLK_MQ_NO_TAG;
@@ -659,6 +660,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
 			goto out_queue_exit;
 	}
 	rq->__data_len = 0;
+	rq->__page_gaps = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
@@ -739,6 +741,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	rq->__data_len = 0;
+	rq->__page_gaps = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
@@ -2665,6 +2668,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 	rq->bio = rq->biotail = bio;
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->__page_gaps = bio->page_gaps;
 	rq->nr_phys_segments = nr_segs;
 	if (bio_integrity(bio))
 		rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3363,6 +3367,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 
 	/* Copy attributes of the original request to the clone request. */
 	rq->__sector = blk_rq_pos(rq_src);
+	rq->__page_gaps = blk_rq_page_gaps(rq_src);
 	rq->__data_len = blk_rq_bytes(rq_src);
 	if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
 		rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a5a828f19a0b..d8f491867adc0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -115,6 +115,7 @@ struct request {
 
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
+	unsigned int __page_gaps;	/* a mask of all the segment page gaps */
 	sector_t __sector;		/* sector cursor */
 
 	struct bio *bio;
@@ -1080,6 +1081,11 @@ static inline sector_t blk_rq_pos(const struct request *rq)
 	return rq->__sector;
 }
 
+static inline unsigned int blk_rq_page_gaps(const struct request *rq)
+{
+	return rq->__page_gaps;
+}
+
 static inline unsigned int blk_rq_bytes(const struct request *rq)
 {
 	return rq->__data_len;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0a29b20939d17..d0ed28d40fe02 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -264,6 +264,8 @@ struct bio {
 
 	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
 
+	unsigned int		page_gaps;	/* a mask of all the vector gaps */
+
 	atomic_t		__bi_cnt;	/* pin count */
 
 	struct bio_vec		*bi_io_vec;	/* the actual vec list */
-- 
2.47.3



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices
  2025-08-05 19:56 [PATCH 1/2] block: accumulate segment page gaps per bio Keith Busch
@ 2025-08-05 19:56 ` Keith Busch
  2025-08-06 14:55   ` Christoph Hellwig
  2025-08-05 20:32 ` [PATCH 1/2] block: accumulate segment page gaps per bio Caleb Sander Mateos
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2025-08-05 19:56 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The nvme virtual boundary is only for the PRP format. Devices that can
use the SGL format don't need it for IO queues. Drop reporting it for
such PCIe devices; fabrics target will continue to use the limit.

Applications can still continue to align to it, and the driver can still
decide to use the PRP format if the IO allows it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 14 +++++++++-----
 drivers/nvme/host/pci.c  |  2 ++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 812c1565114fd..c012c209c7670 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2065,13 +2065,17 @@ static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
 }
 
 static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
-		struct queue_limits *lim)
+		struct queue_limits *lim, bool admin)
 {
 	lim->max_hw_sectors = ctrl->max_hw_sectors;
 	lim->max_segments = min_t(u32, USHRT_MAX,
 		min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
 	lim->max_integrity_segments = ctrl->max_integrity_segments;
-	lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+	if (!nvme_ctrl_sgl_supported(ctrl) || admin ||
+	    ctrl->ops->flags & NVME_F_FABRICS)
+		lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+	else
+		lim->virt_boundary_mask = 0;
 	lim->max_segment_size = UINT_MAX;
 	lim->dma_alignment = 3;
 }
@@ -2173,7 +2177,7 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
 	int ret;
 
 	lim = queue_limits_start_update(ns->disk->queue);
-	nvme_set_ctrl_limits(ns->ctrl, &lim);
+	nvme_set_ctrl_limits(ns->ctrl, &lim, false);
 
 	memflags = blk_mq_freeze_queue(ns->disk->queue);
 	ret = queue_limits_commit_update(ns->disk->queue, &lim);
@@ -2377,7 +2381,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	ns->head->lba_shift = id->lbaf[lbaf].ds;
 	ns->head->nuse = le64_to_cpu(id->nuse);
 	capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
-	nvme_set_ctrl_limits(ns->ctrl, &lim);
+	nvme_set_ctrl_limits(ns->ctrl, &lim, false);
 	nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
 	nvme_set_chunk_sectors(ns, id, &lim);
 	if (!nvme_update_disk_info(ns, id, &lim))
@@ -3580,7 +3584,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
 
 	lim = queue_limits_start_update(ctrl->admin_q);
-	nvme_set_ctrl_limits(ctrl, &lim);
+	nvme_set_ctrl_limits(ctrl, &lim, true);
 	ret = queue_limits_commit_update(ctrl->admin_q, &lim);
 	if (ret)
 		goto out_free;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4a23729f399ac..ec9eb158b43d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -623,6 +623,8 @@ static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
 			return SGL_FORCED;
 		if (req->nr_integrity_segments > 1)
 			return SGL_FORCED;
+		if (blk_rq_page_gaps(req) & (NVME_CTRL_PAGE_SIZE - 1))
+			return SGL_FORCED;
 		return SGL_SUPPORTED;
 	}
 
-- 
2.47.3



^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-05 19:56 [PATCH 1/2] block: accumulate segment page gaps per bio Keith Busch
  2025-08-05 19:56 ` [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices Keith Busch
@ 2025-08-05 20:32 ` Caleb Sander Mateos
  2025-08-05 20:52   ` Keith Busch
  2025-08-05 22:52 ` Keith Busch
  2025-08-06 14:56 ` Christoph Hellwig
  3 siblings, 1 reply; 15+ messages in thread
From: Caleb Sander Mateos @ 2025-08-05 20:32 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, Keith Busch

On Tue, Aug 5, 2025 at 3:59 PM Keith Busch <kbusch@meta.com> wrote:
>
> From: Keith Busch <kbusch@kernel.org>
>
> The blk-mq dma iteration has an optimization for requests that align to
> the device's iommu merge boundary. This boundary may be larger than the
> device's virtual boundary, but the code had been depending on that queue
> limit to know ahead of time if the request aligns to the optimization.
>
> Rather than rely on that queue limit, which many devices may not even
> have, store the virtual boundary gaps of each segment into the bio as a
> mask while checking the segments and merging. We can then quickly check
> per io if the request can use the optimization or not.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  block/blk-merge.c         | 30 +++++++++++++++++++++++++++---
>  block/blk-mq-dma.c        |  3 +--
>  block/blk-mq.c            |  5 +++++
>  include/linux/blk-mq.h    |  6 ++++++
>  include/linux/blk_types.h |  2 ++
>  5 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 81bdad915699a..d63389c063006 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -278,6 +278,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
>         return lim->logical_block_size;
>  }
>
> +#define bv_seg_gap(bv, bvprv) \
> +       bv.bv_offset | ((bvprv.bv_offset + bvprv.bv_len) & (PAGE_SIZE - 1));

Extra semicolon and missing parentheses around inputs and output. Is
there a reason not to make this a static inline function rather than a
macro?

Best,
Caleb

> +
>  /**
>   * bio_split_rw_at - check if and where to split a read/write bio
>   * @bio:  [in] bio to be split
> @@ -293,9 +296,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
>  int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>                 unsigned *segs, unsigned max_bytes)
>  {
> +       unsigned nsegs = 0, bytes = 0, page_gaps = 0;
>         struct bio_vec bv, bvprv, *bvprvp = NULL;
>         struct bvec_iter iter;
> -       unsigned nsegs = 0, bytes = 0;
>
>         bio_for_each_bvec(bv, bio, iter) {
>                 if (bv.bv_offset & lim->dma_alignment)
> @@ -305,8 +308,11 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>                  * If the queue doesn't support SG gaps and adding this
>                  * offset would create a gap, disallow it.
>                  */
> -               if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
> -                       goto split;
> +               if (bvprvp) {
> +                       if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
> +                               goto split;
> +                       page_gaps |= bv_seg_gap(bv, bvprv);
> +               }
>
>                 if (nsegs < lim->max_segments &&
>                     bytes + bv.bv_len <= max_bytes &&
> @@ -324,6 +330,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>         }
>
>         *segs = nsegs;
> +       bio->page_gaps = page_gaps;
>         return 0;
>  split:
>         if (bio->bi_opf & REQ_ATOMIC)
> @@ -353,6 +360,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>          * big IO can be trival, disable iopoll when split needed.
>          */
>         bio_clear_polled(bio);
> +       bio->page_gaps = page_gaps;
>         return bytes >> SECTOR_SHIFT;
>  }
>  EXPORT_SYMBOL_GPL(bio_split_rw_at);
> @@ -696,6 +704,8 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
>  static struct request *attempt_merge(struct request_queue *q,
>                                      struct request *req, struct request *next)
>  {
> +       struct bio_vec bv, bvprv;
> +
>         if (!rq_mergeable(req) || !rq_mergeable(next))
>                 return NULL;
>
> @@ -753,6 +763,10 @@ static struct request *attempt_merge(struct request_queue *q,
>         if (next->start_time_ns < req->start_time_ns)
>                 req->start_time_ns = next->start_time_ns;
>
> +       bv = next->bio->bi_io_vec[0];
> +       bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> +       req->__page_gaps |= blk_rq_page_gaps(next) | bv_seg_gap(bv, bvprv);
> +
>         req->biotail->bi_next = next->bio;
>         req->biotail = next->biotail;
>
> @@ -861,6 +875,7 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
>                 struct bio *bio, unsigned int nr_segs)
>  {
>         const blk_opf_t ff = bio_failfast(bio);
> +       struct bio_vec bv, bvprv;
>
>         if (!ll_back_merge_fn(req, bio, nr_segs))
>                 return BIO_MERGE_FAILED;
> @@ -876,6 +891,10 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
>         if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
>                 blk_zone_write_plug_bio_merged(bio);
>
> +       bv = bio->bi_io_vec[0];
> +       bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> +       req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
> +
>         req->biotail->bi_next = bio;
>         req->biotail = bio;
>         req->__data_len += bio->bi_iter.bi_size;
> @@ -890,6 +909,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
>                 struct bio *bio, unsigned int nr_segs)
>  {
>         const blk_opf_t ff = bio_failfast(bio);
> +       struct bio_vec bv, bvprv;
>
>         /*
>          * A front merge for writes to sequential zones of a zoned block device
> @@ -910,6 +930,10 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
>
>         blk_update_mixed_merge(req, bio, true);
>
> +       bv = req->bio->bi_io_vec[0];
> +       bvprv = bio->bi_io_vec[bio->bi_vcnt - 1];
> +       req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
> +
>         bio->bi_next = req->bio;
>         req->bio = bio;
>
> diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
> index faa36ff6465ee..a03067c4a268f 100644
> --- a/block/blk-mq-dma.c
> +++ b/block/blk-mq-dma.c
> @@ -73,8 +73,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
>  static inline bool blk_can_dma_map_iova(struct request *req,
>                 struct device *dma_dev)
>  {
> -       return !((queue_virt_boundary(req->q) + 1) &
> -               dma_get_merge_boundary(dma_dev));
> +       return !(blk_rq_page_gaps(req) & dma_get_merge_boundary(dma_dev));
>  }
>
>  static bool blk_dma_map_bus(struct blk_dma_iter *iter)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index b67d6c02ecebd..09134a66c5666 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>         INIT_LIST_HEAD(&rq->queuelist);
>         rq->q = q;
>         rq->__sector = (sector_t) -1;
> +       rq->__page_gaps = 0;
>         INIT_HLIST_NODE(&rq->hash);
>         RB_CLEAR_NODE(&rq->rb_node);
>         rq->tag = BLK_MQ_NO_TAG;
> @@ -659,6 +660,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
>                         goto out_queue_exit;
>         }
>         rq->__data_len = 0;
> +       rq->__page_gaps = 0;
>         rq->__sector = (sector_t) -1;
>         rq->bio = rq->biotail = NULL;
>         return rq;
> @@ -739,6 +741,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>         rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
>         blk_mq_rq_time_init(rq, alloc_time_ns);
>         rq->__data_len = 0;
> +       rq->__page_gaps = 0;
>         rq->__sector = (sector_t) -1;
>         rq->bio = rq->biotail = NULL;
>         return rq;
> @@ -2665,6 +2668,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
>         rq->bio = rq->biotail = bio;
>         rq->__sector = bio->bi_iter.bi_sector;
>         rq->__data_len = bio->bi_iter.bi_size;
> +       rq->__page_gaps = bio->page_gaps;
>         rq->nr_phys_segments = nr_segs;
>         if (bio_integrity(bio))
>                 rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
> @@ -3363,6 +3367,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
>
>         /* Copy attributes of the original request to the clone request. */
>         rq->__sector = blk_rq_pos(rq_src);
> +       rq->__page_gaps = blk_rq_page_gaps(rq_src);
>         rq->__data_len = blk_rq_bytes(rq_src);
>         if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
>                 rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 2a5a828f19a0b..d8f491867adc0 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -115,6 +115,7 @@ struct request {
>
>         /* the following two fields are internal, NEVER access directly */
>         unsigned int __data_len;        /* total data len */
> +       unsigned int __page_gaps;       /* a mask of all the segment page gaps */
>         sector_t __sector;              /* sector cursor */
>
>         struct bio *bio;
> @@ -1080,6 +1081,11 @@ static inline sector_t blk_rq_pos(const struct request *rq)
>         return rq->__sector;
>  }
>
> +static inline unsigned int blk_rq_page_gaps(const struct request *rq)
> +{
> +       return rq->__page_gaps;
> +}
> +
>  static inline unsigned int blk_rq_bytes(const struct request *rq)
>  {
>         return rq->__data_len;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 0a29b20939d17..d0ed28d40fe02 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -264,6 +264,8 @@ struct bio {
>
>         unsigned short          bi_max_vecs;    /* max bvl_vecs we can hold */
>
> +       unsigned int            page_gaps;      /* a mask of all the vector gaps */
> +
>         atomic_t                __bi_cnt;       /* pin count */
>
>         struct bio_vec          *bi_io_vec;     /* the actual vec list */
> --
> 2.47.3
>
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-05 20:32 ` [PATCH 1/2] block: accumulate segment page gaps per bio Caleb Sander Mateos
@ 2025-08-05 20:52   ` Keith Busch
  0 siblings, 0 replies; 15+ messages in thread
From: Keith Busch @ 2025-08-05 20:52 UTC (permalink / raw)
  To: Caleb Sander Mateos; +Cc: Keith Busch, linux-block, linux-nvme, hch, axboe

On Tue, Aug 05, 2025 at 04:32:22PM -0400, Caleb Sander Mateos wrote:
> On Tue, Aug 5, 2025 at 3:59 PM Keith Busch <kbusch@meta.com> wrote:
> >
> > +#define bv_seg_gap(bv, bvprv) \
> > +       bv.bv_offset | ((bvprv.bv_offset + bvprv.bv_len) & (PAGE_SIZE - 1));
> 
> Extra semicolon and missing parentheses around inputs and output. Is
> there a reason not to make this a static inline function rather than a
> macro?

Using onstack bio_vec's made sense to manipulate with macros rather than
functions. But I suppose a static inline function won't push copies on
the stack either, so sure, I can change it.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-05 19:56 [PATCH 1/2] block: accumulate segment page gaps per bio Keith Busch
  2025-08-05 19:56 ` [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices Keith Busch
  2025-08-05 20:32 ` [PATCH 1/2] block: accumulate segment page gaps per bio Caleb Sander Mateos
@ 2025-08-05 22:52 ` Keith Busch
  2025-08-06 14:53   ` Christoph Hellwig
  2025-08-06 14:56 ` Christoph Hellwig
  3 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2025-08-05 22:52 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe

On Tue, Aug 05, 2025 at 12:56:07PM -0700, Keith Busch wrote:
> +	bv = next->bio->bi_io_vec[0];
> +	bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];

Hm, commit 7bcd79ac50d9d8 suggests I missed something obvious about not
being allowed to use 'bi_vcnt - 1' for the tail bvec of the previous
bio. I guess it's some stacking thing.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-05 22:52 ` Keith Busch
@ 2025-08-06 14:53   ` Christoph Hellwig
  2025-08-06 15:17     ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2025-08-06 14:53 UTC (permalink / raw)
  To: Keith Busch; +Cc: Keith Busch, linux-block, linux-nvme, hch, axboe

On Tue, Aug 05, 2025 at 04:52:11PM -0600, Keith Busch wrote:
> On Tue, Aug 05, 2025 at 12:56:07PM -0700, Keith Busch wrote:
> > +	bv = next->bio->bi_io_vec[0];
> > +	bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> 
> Hm, commit 7bcd79ac50d9d8 suggests I missed something obvious about not
> being allowed to use 'bi_vcnt - 1' for the tail bvec of the previous
> bio. I guess it's some stacking thing.

It is bio cloning and splitting.  bi_io_vec is the original payload added
to a bio, and directly accessing it is only allowed for the submitter,
i.e. usually the file system.  The I/O stack always need to go through
the bio_iter to deal with split/cloned/truncated/partially completed and
resubmitted bios.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices
  2025-08-05 19:56 ` [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices Keith Busch
@ 2025-08-06 14:55   ` Christoph Hellwig
  2025-08-06 15:04     ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2025-08-06 14:55 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, Keith Busch

On Tue, Aug 05, 2025 at 12:56:08PM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The nvme virtual boundary is only for the PRP format. Devices that can
> use the SGL format don't need it for IO queues. Drop reporting it for
> such PCIe devices; fabrics target will continue to use the limit.

That's not quite true any more as of 6.17.  We now also rely it for
efficiently mapping multiple segments into a single IOMMU mapping.
So by not enforcing it for IOMMU mode.  In many cases we're better
off splitting I/O rather forcing a non-optimized IOMMU mapping.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-05 19:56 [PATCH 1/2] block: accumulate segment page gaps per bio Keith Busch
                   ` (2 preceding siblings ...)
  2025-08-05 22:52 ` Keith Busch
@ 2025-08-06 14:56 ` Christoph Hellwig
  2025-08-06 15:44   ` Keith Busch
  3 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2025-08-06 14:56 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, Keith Busch

> index 0a29b20939d17..d0ed28d40fe02 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -264,6 +264,8 @@ struct bio {
>  
>  	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
>  
> +	unsigned int		page_gaps;	/* a mask of all the vector gaps */

Bloating the bio for the gaps, especially as the bio is otherwise not
built to hardware limits at all seems like an odd tradeoff.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices
  2025-08-06 14:55   ` Christoph Hellwig
@ 2025-08-06 15:04     ` Keith Busch
  0 siblings, 0 replies; 15+ messages in thread
From: Keith Busch @ 2025-08-06 15:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe

On Wed, Aug 06, 2025 at 04:55:14PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 05, 2025 at 12:56:08PM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > The nvme virtual boundary is only for the PRP format. Devices that can
> > use the SGL format don't need it for IO queues. Drop reporting it for
> > such PCIe devices; fabrics target will continue to use the limit.
> 
> That's not quite true any more as of 6.17.  We now also rely it for
> efficiently mapping multiple segments into a single IOMMU mapping.
> So by not enforcing it for IOMMU mode.  In many cases we're better
> off splitting I/O rather forcing a non-optimized IOMMU mapping.

Patch 1 removes the reliance on the virt boundary for the IOMMU. This
makes it possible for NVMe to use this optimization on ARM64 SMMU, which
we saw earlier can come in a larger granularity than NVMe's. Without
patch 1, NVMe could never use that optimization on such an architecture,
but now it can applications that choose to subscribe to that alignment.

This patch, though, is more about being able to utilize user space
buffers directly that can not be split into any valid IO's. This is
possible now with patch one not relying on the virt boundary for IOMMU
optimizations. In truth, for my use case, the IOMMU is either set to off
or passthrough, so that optimzation isn't reachable. The use case I'm
going for is taking zero-copy receive buffers from a network device and
directly using them for storage IO. The user data doesn't arrive in
nicely aligned segments from there.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-06 14:53   ` Christoph Hellwig
@ 2025-08-06 15:17     ` Keith Busch
  0 siblings, 0 replies; 15+ messages in thread
From: Keith Busch @ 2025-08-06 15:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe

On Wed, Aug 06, 2025 at 04:53:34PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 05, 2025 at 04:52:11PM -0600, Keith Busch wrote:
> > On Tue, Aug 05, 2025 at 12:56:07PM -0700, Keith Busch wrote:
> > > +	bv = next->bio->bi_io_vec[0];
> > > +	bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> > 
> > Hm, commit 7bcd79ac50d9d8 suggests I missed something obvious about not
> > being allowed to use 'bi_vcnt - 1' for the tail bvec of the previous
> > bio. I guess it's some stacking thing.
> 
> It is bio cloning and splitting.  bi_io_vec is the original payload added
> to a bio, and directly accessing it is only allowed for the submitter,
> i.e. usually the file system.  The I/O stack always need to go through
> the bio_iter to deal with split/cloned/truncated/partially completed and
> resubmitted bios.

Yeah, I knew about the splitting, but I am only using this for merging,
and bio's split off the front can't merge, so thought I was okay.

But the cloning, yes, I messed that up. v2 is sent with a fix for it.
Sorry for the rapid fire versions; I realized you were travelling, so I
didn't expect more feedback on v1. But thank you for finding some time
to review anyway.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-06 14:56 ` Christoph Hellwig
@ 2025-08-06 15:44   ` Keith Busch
  2025-08-10 14:31     ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2025-08-06 15:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe

On Wed, Aug 06, 2025 at 04:56:21PM +0200, Christoph Hellwig wrote:
> > index 0a29b20939d17..d0ed28d40fe02 100644
> > --- a/include/linux/blk_types.h
> > +++ b/include/linux/blk_types.h
> > @@ -264,6 +264,8 @@ struct bio {
> >  
> >  	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
> >  
> > +	unsigned int		page_gaps;	/* a mask of all the vector gaps */
> 
> Bloating the bio for the gaps, especially as the bio is otherwise not
> built to hardware limits at all seems like an odd tradeoff.

Maybe, but I don't have anywhere else to put this. We split the bio to
its hardware limits at some point, which is where this field gets
initially set.

It doesn't need to be a mask (though that conceptually is the most
intuitive). It really just needs to indicate the lowest set bit of any
page gap between segments. There is a one byte hole in the bio that can
fit it without changing the bio size.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-06 15:44   ` Keith Busch
@ 2025-08-10 14:31     ` Christoph Hellwig
  2025-08-11 15:27       ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:31 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-block, linux-nvme, axboe

On Wed, Aug 06, 2025 at 09:44:47AM -0600, Keith Busch wrote:
> On Wed, Aug 06, 2025 at 04:56:21PM +0200, Christoph Hellwig wrote:
> > > index 0a29b20939d17..d0ed28d40fe02 100644
> > > --- a/include/linux/blk_types.h
> > > +++ b/include/linux/blk_types.h
> > > @@ -264,6 +264,8 @@ struct bio {
> > >  
> > >  	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
> > >  
> > > +	unsigned int		page_gaps;	/* a mask of all the vector gaps */
> > 
> > Bloating the bio for the gaps, especially as the bio is otherwise not
> > built to hardware limits at all seems like an odd tradeoff.
> 
> Maybe, but I don't have anywhere else to put this. We split the bio to
> its hardware limits at some point, which is where this field gets
> initially set.

What we do for nr_segs is to just do it on stack while splitting,
and then only record it in the request.  I think you could do the same
here, i.e. replace the nr_segs output parameter to __bio_split_to_limits
and it's helpers with a new struct bio_split_info that in the first
version just contains nr_segs, but can be extended to other easily
derived information like the page gaps.



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-10 14:31     ` Christoph Hellwig
@ 2025-08-11 15:27       ` Keith Busch
  2025-08-11 16:17         ` Christoph Hellwig
  0 siblings, 1 reply; 15+ messages in thread
From: Keith Busch @ 2025-08-11 15:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe

On Sun, Aug 10, 2025 at 04:31:12PM +0200, Christoph Hellwig wrote:
> On Wed, Aug 06, 2025 at 09:44:47AM -0600, Keith Busch wrote:
> > 
> > Maybe, but I don't have anywhere else to put this. We split the bio to
> > its hardware limits at some point, which is where this field gets
> > initially set.
> 
> What we do for nr_segs is to just do it on stack while splitting,
> and then only record it in the request.  I think you could do the same
> here, i.e. replace the nr_segs output parameter to __bio_split_to_limits
> and it's helpers with a new struct bio_split_info that in the first
> version just contains nr_segs, but can be extended to other easily
> derived information like the page gaps.

I initially tried to copy the nsegs usage in the request, but there are
multiple places (iomap, xfs, and btrfs) that split to hardware limits
without a request, so I'm not sure where the result is supposed to go to
be referenced later. Or do those all call the same split function later
in the generic block layer, in which case it shouldn't matter if the
upper layers already called it?


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-11 15:27       ` Keith Busch
@ 2025-08-11 16:17         ` Christoph Hellwig
  2025-08-20 19:22           ` Keith Busch
  0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2025-08-11 16:17 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-block, linux-nvme, axboe

On Mon, Aug 11, 2025 at 09:27:18AM -0600, Keith Busch wrote:
> I initially tried to copy the nsegs usage in the request, but there are
> multiple places (iomap, xfs, and btrfs) that split to hardware limits
> without a request, so I'm not sure where the result is supposed to go to
> be referenced later. Or do those all call the same split function later
> in the generic block layer, in which case it shouldn't matter if the
> upper layers already called it?

Yes, we'll always end up calling into __bio_split_to_limits in blk-mq,
no matter if someone split before.  The upper layer splits are only
for zone append users that can't later be split, but
__bio_split_to_limits is stilled called on them to count the segments
and to assert that they don't need splitting.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] block: accumulate segment page gaps per bio
  2025-08-11 16:17         ` Christoph Hellwig
@ 2025-08-20 19:22           ` Keith Busch
  0 siblings, 0 replies; 15+ messages in thread
From: Keith Busch @ 2025-08-20 19:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe

On Mon, Aug 11, 2025 at 06:17:56PM +0200, Christoph Hellwig wrote:
> On Mon, Aug 11, 2025 at 09:27:18AM -0600, Keith Busch wrote:
> > I initially tried to copy the nsegs usage in the request, but there are
> > multiple places (iomap, xfs, and btrfs) that split to hardware limits
> > without a request, so I'm not sure where the result is supposed to go to
> > be referenced later. Or do those all call the same split function later
> > in the generic block layer, in which case it shouldn't matter if the
> > upper layers already called it?
> 
> Yes, we'll always end up calling into __bio_split_to_limits in blk-mq,
> no matter if someone split before.  The upper layer splits are only
> for zone append users that can't later be split, but
> __bio_split_to_limits is stilled called on them to count the segments
> and to assert that they don't need splitting.

Zone write plugging presents a problem. For the same reason that
"__bi_nr_segments" exists, I have to stash this result somewhere in the
bio struct. I mentioned earlier I just need one byte, and there's a byte
hole in the bio already, so won't need to increase the size.


^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2025-08-20 23:15 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-05 19:56 [PATCH 1/2] block: accumulate segment page gaps per bio Keith Busch
2025-08-05 19:56 ` [PATCH 2/2] nvme: remove virtual boundary for sgl capable devices Keith Busch
2025-08-06 14:55   ` Christoph Hellwig
2025-08-06 15:04     ` Keith Busch
2025-08-05 20:32 ` [PATCH 1/2] block: accumulate segment page gaps per bio Caleb Sander Mateos
2025-08-05 20:52   ` Keith Busch
2025-08-05 22:52 ` Keith Busch
2025-08-06 14:53   ` Christoph Hellwig
2025-08-06 15:17     ` Keith Busch
2025-08-06 14:56 ` Christoph Hellwig
2025-08-06 15:44   ` Keith Busch
2025-08-10 14:31     ` Christoph Hellwig
2025-08-11 15:27       ` Keith Busch
2025-08-11 16:17         ` Christoph Hellwig
2025-08-20 19:22           ` Keith Busch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).