* [PATCHv5 0/8] blk dma iter for integrity metadata
@ 2025-08-08 15:58 Keith Busch
  2025-08-08 15:58 ` [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter Keith Busch
                   ` (7 more replies)
  0 siblings, 8 replies; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Previous version:

  https://lore.kernel.org/linux-block/20250731150513.220395-1-kbusch@meta.com/

Changes since v4:

 - Fixed nvme initialization for metadata total length (Kanchan)

 - Removed the now unused __REQ_FLAG_P2PDMA (Kanchan)

 - Fixed checks when CONFIG_BLK_DEV_INTEGRITY is not set (lkp@intel.com)

Keith Busch (8):
  blk-mq-dma: introduce blk_map_iter
  blk-mq-dma: provide the bio_vec list being iterated
  blk-mq-dma: require unmap caller provide p2p map type
  blk-mq: remove REQ_P2PDMA flag
  blk-mq-dma: move common dma start code to a helper
  blk-mq-dma: add support for mapping integrity metadata
  nvme-pci: create common sgl unmapping helper
  nvme-pci: convert metadata mapping to dma iter

 block/bio.c                   |   2 +-
 block/blk-mq-dma.c            | 239 +++++++++++++++++++++-------------
 drivers/nvme/host/pci.c       | 197 +++++++++++++++-------------
 include/linux/blk-integrity.h |  17 +++
 include/linux/blk-mq-dma.h    |  18 ++-
 include/linux/blk_types.h     |   2 -
 6 files changed, 287 insertions(+), 188 deletions(-)

-- 
2.47.3



* [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:04   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Create a type that fully captures the lower-level physical address
iteration.
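
With this, consumers read the physical range for each segment straight
off the iterator rather than from a separate output struct. A minimal
sketch for a normal (non-special) payload, where use_range() is a
made-up placeholder:

	struct blk_map_iter iter = {
		.bio	= req->bio,
		.iter	= req->bio->bi_iter,
	};

	while (blk_map_iter_next(req, &iter))
		use_range(iter.paddr, iter.len);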

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c         | 81 +++++++++++++++++---------------------
 include/linux/blk-mq-dma.h |  9 ++++-
 2 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ad283017caef2..ff4c9a7e19d83 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -5,13 +5,7 @@
 #include <linux/blk-mq-dma.h>
 #include "blk.h"
 
-struct phys_vec {
-	phys_addr_t	paddr;
-	u32		len;
-};
-
-static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
-			      struct phys_vec *vec)
+static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 {
 	unsigned int max_size;
 	struct bio_vec bv;
@@ -19,8 +13,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
 		if (!iter->bio)
 			return false;
-		vec->paddr = bvec_phys(&req->special_vec);
-		vec->len = req->special_vec.bv_len;
+		iter->paddr = bvec_phys(&req->special_vec);
+		iter->len = req->special_vec.bv_len;
 		iter->bio = NULL;
 		return true;
 	}
@@ -29,8 +23,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 		return false;
 
 	bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
-	vec->paddr = bvec_phys(&bv);
-	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+	iter->paddr = bvec_phys(&bv);
+	max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX);
 	bv.bv_len = min(bv.bv_len, max_size);
 	bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
 
@@ -58,7 +52,7 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 		bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
 	}
 
-	vec->len = bv.bv_len;
+	iter->len = bv.bv_len;
 	return true;
 }
 
@@ -77,29 +71,29 @@ static inline bool blk_can_dma_map_iova(struct request *req,
 		dma_get_merge_boundary(dma_dev));
 }
 
-static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+static bool blk_dma_map_bus(struct blk_dma_iter *iter)
 {
-	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
-	iter->len = vec->len;
+	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, iter->iter.paddr);
+	iter->len = iter->iter.len;
 	return true;
 }
 
 static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
-		struct blk_dma_iter *iter, struct phys_vec *vec)
+		struct blk_dma_iter *iter)
 {
-	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
-			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
+	iter->addr = dma_map_page(dma_dev, phys_to_page(iter->iter.paddr),
+			offset_in_page(iter->iter.paddr), iter->iter.len,
+			rq_dma_dir(req));
 	if (dma_mapping_error(dma_dev, iter->addr)) {
 		iter->status = BLK_STS_RESOURCE;
 		return false;
 	}
-	iter->len = vec->len;
+	iter->len = iter->iter.len;
 	return true;
 }
 
 static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, struct blk_dma_iter *iter,
-		struct phys_vec *vec)
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
 	enum dma_data_direction dir = rq_dma_dir(req);
 	unsigned int mapped = 0;
@@ -109,12 +103,12 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	iter->len = dma_iova_size(state);
 
 	do {
-		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
-				vec->len, dir, 0);
+		error = dma_iova_link(dma_dev, state, iter->iter.paddr, mapped,
+				iter->iter.len, dir, 0);
 		if (error)
 			break;
-		mapped += vec->len;
-	} while (blk_map_iter_next(req, &iter->iter, vec));
+		mapped += iter->iter.len;
+	} while (blk_map_iter_next(req, &iter->iter));
 
 	error = dma_iova_sync(dma_dev, state, 0, mapped);
 	if (error) {
@@ -151,7 +145,6 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
 	unsigned int total_len = blk_rq_payload_bytes(req);
-	struct phys_vec vec;
 
 	iter->iter.bio = req->bio;
 	iter->iter.iter = req->bio->bi_iter;
@@ -162,14 +155,14 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	 * Grab the first segment ASAP because we'll need it to check for P2P
 	 * transfers.
 	 */
-	if (!blk_map_iter_next(req, &iter->iter, &vec))
+	if (!blk_map_iter_next(req, &iter->iter))
 		return false;
 
 	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
 		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
-					 phys_to_page(vec.paddr))) {
+					 phys_to_page(iter->iter.paddr))) {
 		case PCI_P2PDMA_MAP_BUS_ADDR:
-			return blk_dma_map_bus(iter, &vec);
+			return blk_dma_map_bus(iter);
 		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
 			/*
 			 * P2P transfers through the host bridge are treated the
@@ -184,9 +177,9 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	}
 
 	if (blk_can_dma_map_iova(req, dma_dev) &&
-	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
-		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
-	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+	    dma_iova_try_alloc(dma_dev, state, iter->iter.paddr, total_len))
+		return blk_rq_dma_map_iova(req, dma_dev, state, iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 
@@ -211,14 +204,12 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
-	struct phys_vec vec;
-
-	if (!blk_map_iter_next(req, &iter->iter, &vec))
+	if (!blk_map_iter_next(req, &iter->iter))
 		return false;
 
 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
-		return blk_dma_map_bus(iter, &vec);
-	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+		return blk_dma_map_bus(iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
 
@@ -246,20 +237,20 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
 		    struct scatterlist **last_sg)
 {
-	struct req_iterator iter = {
-		.bio	= rq->bio,
+	struct bio *bio = rq->bio;
+	struct blk_map_iter iter = {
+		.bio	= bio,
 	};
-	struct phys_vec vec;
 	int nsegs = 0;
 
 	/* the internal flush request may not have bio attached */
-	if (iter.bio)
-		iter.iter = iter.bio->bi_iter;
+	if (bio)
+		iter.iter = bio->bi_iter;
 
-	while (blk_map_iter_next(rq, &iter, &vec)) {
+	while (blk_map_iter_next(rq, &iter)) {
 		*last_sg = blk_next_sg(last_sg, sglist);
-		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
-				offset_in_page(vec.paddr));
+		sg_set_page(*last_sg, phys_to_page(iter.paddr), iter.len,
+				offset_in_page(iter.paddr));
 		nsegs++;
 	}
 
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c26a01aeae006..1e5988afdb978 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -5,6 +5,13 @@
 #include <linux/blk-mq.h>
 #include <linux/pci-p2pdma.h>
 
+struct blk_map_iter {
+	phys_addr_t			paddr;
+	u32				len;
+	struct bvec_iter		iter;
+	struct bio			*bio;
+};
+
 struct blk_dma_iter {
 	/* Output address range for this iteration */
 	dma_addr_t			addr;
@@ -14,7 +21,7 @@ struct blk_dma_iter {
 	blk_status_t			status;
 
 	/* Internal to blk_rq_dma_map_iter_* */
-	struct req_iterator		iter;
+	struct blk_map_iter		iter;
 	struct pci_p2pdma_map_state	p2pdma;
 };
 
-- 
2.47.3



* [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
  2025-08-08 15:58 ` [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:07   ` Christoph Hellwig
  2025-08-10 14:09   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 3/8] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
                   ` (5 subsequent siblings)
  7 siblings, 2 replies; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

This will make it easier to add different sources of the bvec table,
like the upcoming integrity support, rather than assuming the bio's
bi_io_vec is always used. It also brings iterating "special" payloads
more in line with iterating normal payloads.
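
For example, the upcoming integrity support can seed the same iterator
from a bio's bip_vec table instead (sketch, matching what a later patch
in this series does):

	iter = (struct blk_map_iter) {
		.bio	= bio,
		.bvec	= bio->bi_integrity->bip_vec,
		.iter	= bio->bi_integrity->bip_iter,
	};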

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c         | 56 ++++++++++++++++++++++----------------
 include/linux/blk-mq-dma.h |  1 +
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ff4c9a7e19d83..4a013703bcba5 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -10,23 +10,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 	unsigned int max_size;
 	struct bio_vec bv;
 
-	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		if (!iter->bio)
-			return false;
-		iter->paddr = bvec_phys(&req->special_vec);
-		iter->len = req->special_vec.bv_len;
-		iter->bio = NULL;
-		return true;
-	}
-
 	if (!iter->iter.bi_size)
 		return false;
 
-	bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+	bv = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 	iter->paddr = bvec_phys(&bv);
 	max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX);
 	bv.bv_len = min(bv.bv_len, max_size);
-	bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+	bvec_iter_advance_single(iter->bvec, &iter->iter, bv.bv_len);
 
 	/*
 	 * If we are entirely done with this bi_io_vec entry, check if the next
@@ -37,19 +28,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 		struct bio_vec next;
 
 		if (!iter->iter.bi_size) {
-			if (!iter->bio->bi_next)
+			if (!iter->bio || !iter->bio->bi_next)
 				break;
 			iter->bio = iter->bio->bi_next;
 			iter->iter = iter->bio->bi_iter;
+			iter->bvec = iter->bio->bi_io_vec;
 		}
 
-		next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+		next = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 		if (bv.bv_len + next.bv_len > max_size ||
 		    !biovec_phys_mergeable(req->q, &bv, &next))
 			break;
 
 		bv.bv_len += next.bv_len;
-		bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+		bvec_iter_advance_single(iter->bvec, &iter->iter, next.bv_len);
 	}
 
 	iter->len = bv.bv_len;
@@ -119,6 +111,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	return true;
 }
 
+static struct blk_map_iter blk_rq_map_iter(struct request *rq)
+{
+	struct bio *bio = rq->bio;
+
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		return (struct blk_map_iter) {
+			.bvec = &rq->special_vec,
+			.iter = {
+				.bi_size = rq->special_vec.bv_len,
+			}
+		};
+       }
+
+	/* the internal flush request may not have bio attached */
+	if (!bio)
+	        return (struct blk_map_iter) {};
+
+	return (struct blk_map_iter) {
+		.bio = bio,
+		.bvec = bio->bi_io_vec,
+		.iter = bio->bi_iter,
+	};
+}
+
 /**
  * blk_rq_dma_map_iter_start - map the first DMA segment for a request
  * @req:	request to map
@@ -146,10 +162,9 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 {
 	unsigned int total_len = blk_rq_payload_bytes(req);
 
-	iter->iter.bio = req->bio;
-	iter->iter.iter = req->bio->bi_iter;
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
+	iter->iter = blk_rq_map_iter(req);
 
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
@@ -237,16 +252,9 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
 		    struct scatterlist **last_sg)
 {
-	struct bio *bio = rq->bio;
-	struct blk_map_iter iter = {
-		.bio	= bio,
-	};
+	struct blk_map_iter iter = blk_rq_map_iter(rq);
 	int nsegs = 0;
 
-	/* the internal flush request may not have bio attached */
-	if (bio)
-		iter.iter = bio->bi_iter;
-
 	while (blk_map_iter_next(rq, &iter)) {
 		*last_sg = blk_next_sg(last_sg, sglist);
 		sg_set_page(*last_sg, phys_to_page(iter.paddr), iter.len,
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 1e5988afdb978..c82f880dee914 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -8,6 +8,7 @@
 struct blk_map_iter {
 	phys_addr_t			paddr;
 	u32				len;
+	struct bio_vec                  *bvec;
 	struct bvec_iter		iter;
 	struct bio			*bio;
 };
-- 
2.47.3



* [PATCHv5 3/8] blk-mq-dma: require unmap caller provide p2p map type
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
  2025-08-08 15:58 ` [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter Keith Busch
  2025-08-08 15:58 ` [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:08   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 4/8] blk-mq: remove REQ_P2PDMA flag Keith Busch
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

In preparation for integrity DMA mappings, we can't rely on the request
flag because data and metadata may have different mapping types.
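
The driver stashes the mapping type when it maps and feeds it back at
unmap time, as the nvme conversion below does:

	/* at map time */
	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		iod->flags |= IOD_P2P_BUS_ADDR;

	/* at unmap time */
	blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
			 iod->flags & IOD_P2P_BUS_ADDR);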

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c    | 9 ++++++++-
 include/linux/blk-mq-dma.h | 5 +++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2c6d9506b1725..111b6bc6c93eb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -261,6 +261,9 @@ enum nvme_iod_flags {
 
 	/* single segment dma mapping */
 	IOD_SINGLE_SEGMENT	= 1U << 2,
+
+	/* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+	IOD_P2P_BUS_ADDR	= 1U << 3,
 };
 
 struct nvme_dma_vec {
@@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req)
 		return;
 	}
 
-	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+				iod->flags & IOD_P2P_BUS_ADDR)) {
 		if (nvme_pci_cmd_use_sgl(&iod->cmd))
 			nvme_free_sgls(req);
 		else
@@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req)
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
+	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		iod->flags |= IOD_P2P_BUS_ADDR;
+
 	if (use_sgl == SGL_FORCED ||
 	    (use_sgl == SGL_SUPPORTED &&
 	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c82f880dee914..aef8d784952ff 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -49,14 +49,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
  * @dma_dev:	device to unmap from
  * @state:	DMA IOVA state
  * @mapped_len: number of bytes to unmap
+ * @is_p2p:	true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
  */
 static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
+		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
 {
-	if (req->cmd_flags & REQ_P2PDMA)
+	if (is_p2p)
 		return true;
 
 	if (dma_use_iova(state)) {
-- 
2.47.3



* [PATCHv5 4/8] blk-mq: remove REQ_P2PDMA flag
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
                   ` (2 preceding siblings ...)
  2025-08-08 15:58 ` [PATCHv5 3/8] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:08   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 5/8] blk-mq-dma: move common dma start code to a helper Keith Busch
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

It's not serving any particular purpose. pci_p2pdma_state() already has
all the appropriate checks, so the config and flag checks are not
guarding anything.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c               |  2 +-
 block/blk-mq-dma.c        | 30 ++++++++++++++----------------
 include/linux/blk_types.h |  2 --
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 92c512e876c8d..f56d285e6958e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
 	WARN_ON_ONCE(bio_full(bio, len));
 
 	if (is_pci_p2pdma_page(page))
-		bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+		bio->bi_opf |= REQ_NOMERGE;
 
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
 	bio->bi_iter.bi_size += len;
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 4a013703bcba5..988c27667df67 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -173,22 +173,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	if (!blk_map_iter_next(req, &iter->iter))
 		return false;
 
-	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
-		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
-					 phys_to_page(iter->iter.paddr))) {
-		case PCI_P2PDMA_MAP_BUS_ADDR:
-			return blk_dma_map_bus(iter);
-		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-			/*
-			 * P2P transfers through the host bridge are treated the
-			 * same as non-P2P transfers below and during unmap.
-			 */
-			req->cmd_flags &= ~REQ_P2PDMA;
-			break;
-		default:
-			iter->status = BLK_STS_INVAL;
-			return false;
-		}
+	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+				 phys_to_page(iter->iter.paddr))) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		return blk_dma_map_bus(iter);
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		/*
+		 * P2P transfers through the host bridge are treated the
+		 * same as non-P2P transfers below and during unmap.
+		 */
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		iter->status = BLK_STS_INVAL;
+		return false;
 	}
 
 	if (blk_can_dma_map_iova(req, dma_dev) &&
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09b99d52fd365..930daff207df2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -386,7 +386,6 @@ enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
-	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -419,7 +418,6 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
-- 
2.47.3



* [PATCHv5 5/8] blk-mq-dma: move common dma start code to a helper
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
                   ` (3 preceding siblings ...)
  2025-08-08 15:58 ` [PATCHv5 4/8] blk-mq: remove REQ_P2PDMA flag Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:10   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 6/8] blk-mq-dma: add support for mapping integrity metadata Keith Busch
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

In preparation for DMA mapping integrity metadata, move the common DMA
setup to a helper.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c | 60 +++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 988c27667df67..bc694fecb39dc 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -135,36 +135,12 @@ static struct blk_map_iter blk_rq_map_iter(struct request *rq)
 	};
 }
 
-/**
- * blk_rq_dma_map_iter_start - map the first DMA segment for a request
- * @req:	request to map
- * @dma_dev:	device to map to
- * @state:	DMA IOVA state
- * @iter:	block layer DMA iterator
- *
- * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
- * caller and don't need to be initialized.  @state needs to be stored for use
- * at unmap time, @iter is only needed at map time.
- *
- * Returns %false if there is no segment to map, including due to an error, or
- * %true ft it did map a segment.
- *
- * If a segment was mapped, the DMA address for it is returned in @iter.addr and
- * the length in @iter.len.  If no segment was mapped the status code is
- * returned in @iter.status.
- *
- * The caller can call blk_rq_dma_map_coalesce() to check if further segments
- * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
- * to try to map the following segments.
- */
-bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, struct blk_dma_iter *iter)
+static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter,
+		unsigned int total_len)
 {
-	unsigned int total_len = blk_rq_payload_bytes(req);
-
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
-	iter->iter = blk_rq_map_iter(req);
 
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
@@ -194,6 +170,36 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		return blk_rq_dma_map_iova(req, dma_dev, state, iter);
 	return blk_dma_map_direct(req, dma_dev, iter);
 }
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req:	request to map
+ * @dma_dev:	device to map to
+ * @state:	DMA IOVA state
+ * @iter:	block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
+ * caller and don't need to be initialized.  @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true ft it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len.  If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+	iter->iter = blk_rq_map_iter(req);
+	return blk_dma_map_iter_start(req, dma_dev, state, iter,
+				      blk_rq_payload_bytes(req));
+}
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 
 /**
-- 
2.47.3



* [PATCHv5 6/8] blk-mq-dma: add support for mapping integrity metadata
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
                   ` (4 preceding siblings ...)
  2025-08-08 15:58 ` [PATCHv5 5/8] blk-mq-dma: move common dma start code to a helper Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:16   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 7/8] nvme-pci: create common sgl unmapping helper Keith Busch
  2025-08-08 15:58 ` [PATCHv5 8/8] nvme-pci: convert metadata mapping to dma iter Keith Busch
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Provide integrity metadata helpers, equivalent to the data payload
helpers, for iterating a request during DMA setup.
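
Drivers consume these with the same loop shape as the data path; a
minimal sketch, where "state" is the caller's dma_iova_state and
program_meta_segment() is a made-up placeholder:

	struct blk_dma_iter iter;

	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev, &state,
						 &iter))
		return iter.status;
	do {
		program_meta_segment(iter.addr, iter.len);
	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));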

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c            | 62 +++++++++++++++++++++++++++++++----
 include/linux/blk-integrity.h | 17 ++++++++++
 include/linux/blk-mq-dma.h    |  3 ++
 3 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index bc694fecb39dc..6acf46679c96a 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,9 +2,30 @@
 /*
  * Copyright (C) 2025 Christoph Hellwig
  */
+#include <linux/blk-integrity.h>
 #include <linux/blk-mq-dma.h>
 #include "blk.h"
 
+static bool __blk_map_iter_next(struct blk_map_iter *iter)
+{
+	if (iter->iter.bi_size)
+		return true;
+	if (!iter->bio || !iter->bio->bi_next)
+		return false;
+
+	iter->bio = iter->bio->bi_next;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	if (iter->is_integrity) {
+		iter->iter = iter->bio->bi_integrity->bip_iter;
+		iter->bvec = iter->bio->bi_integrity->bip_vec;
+		return true;
+	}
+#endif
+	iter->iter = iter->bio->bi_iter;
+	iter->bvec = iter->bio->bi_io_vec;
+	return true;
+}
+
 static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 {
 	unsigned int max_size;
@@ -27,13 +48,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
 		struct bio_vec next;
 
-		if (!iter->iter.bi_size) {
-			if (!iter->bio || !iter->bio->bi_next)
-				break;
-			iter->bio = iter->bio->bi_next;
-			iter->iter = iter->bio->bi_iter;
-			iter->bvec = iter->bio->bi_io_vec;
-		}
+		if (!__blk_map_iter_next(iter))
+			break;
 
 		next = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 		if (bv.bv_len + next.bv_len > max_size ||
@@ -232,6 +248,38 @@ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
 
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter)
+{
+	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
+					   blk_rq_sectors(req));
+	struct bio *bio = req->bio;
+
+	iter->iter = (struct blk_map_iter) {
+		.bio = bio,
+		.iter = bio->bi_integrity->bip_iter,
+		.bvec = bio->bi_integrity->bip_vec,
+		.is_integrity = true,
+	};
+	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);
+
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+               struct device *dma_dev, struct blk_dma_iter *iter)
+{
+	if (!blk_map_iter_next(req, &iter->iter))
+		return false;
+
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		return blk_dma_map_bus(iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
+#endif
+
 static inline struct scatterlist *
 blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 {
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index e67a2b6e8f111..78fe2459e6612 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -4,6 +4,7 @@
 
 #include <linux/blk-mq.h>
 #include <linux/bio-integrity.h>
+#include <linux/blk-mq-dma.h>
 
 struct request;
 
@@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
 int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
 		     struct logical_block_metadata_cap __user *argp);
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter);
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq,
 {
 	return -EINVAL;
 }
+static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter)
+{
+	return false;
+}
+static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter)
+{
+	return false;
+}
 static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
 {
 	return NULL;
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index aef8d784952ff..5ff7f3ce9059a 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -11,6 +11,9 @@ struct blk_map_iter {
 	struct bio_vec                  *bvec;
 	struct bvec_iter		iter;
 	struct bio			*bio;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+	bool				is_integrity;
+#endif
 };
 
 struct blk_dma_iter {
-- 
2.47.3



* [PATCHv5 7/8] nvme-pci: create common sgl unmapping helper
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
                   ` (5 preceding siblings ...)
  2025-08-08 15:58 ` [PATCHv5 6/8] blk-mq-dma: add support for mapping integrity metadata Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:21   ` Christoph Hellwig
  2025-08-08 15:58 ` [PATCHv5 8/8] nvme-pci: convert metadata mapping to dma iter Keith Busch
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

This can be reused by metadata SGLs once they start using the blk-mq
DMA API.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 111b6bc6c93eb..decb3ad1508a7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -693,25 +693,34 @@ static void nvme_free_prps(struct request *req)
 	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
 }
 
+
+static void __nvme_free_sgls(struct device *dma_dev, struct nvme_sgl_desc *sge,
+		struct nvme_sgl_desc *sg_list, enum dma_data_direction dir)
+{
+	unsigned int len = le32_to_cpu(sge->length);
+	unsigned int i, nr_entries;
+
+	if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
+		dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
+		return;
+	}
+
+	nr_entries = len / sizeof(*sg_list);
+	for (i = 0; i < nr_entries; i++)
+		dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+			le32_to_cpu(sg_list[i].length), dir);
+}
+
 static void nvme_free_sgls(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 	struct device *dma_dev = nvmeq->dev->dev;
-	dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
-	unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
 	struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+	struct nvme_sgl_desc *sge = &iod->cmd.common.dptr.sgl;
 	enum dma_data_direction dir = rq_dma_dir(req);
 
-	if (iod->nr_descriptors) {
-		unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
-
-		for (i = 0; i < nr_entries; i++)
-			dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
-				le32_to_cpu(sg_list[i].length), dir);
-	} else {
-		dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
-	}
+	__nvme_free_sgls(dma_dev, sge, sg_list, dir);
 }
 
 static void nvme_unmap_data(struct request *req)
-- 
2.47.3



* [PATCHv5 8/8] nvme-pci: convert metadata mapping to dma iter
  2025-08-08 15:58 [PATCHv5 0/8] blk dma iter for integrity metadata Keith Busch
                   ` (6 preceding siblings ...)
  2025-08-08 15:58 ` [PATCHv5 7/8] nvme-pci: create common sgl unmapping helper Keith Busch
@ 2025-08-08 15:58 ` Keith Busch
  2025-08-10 14:27   ` Christoph Hellwig
  7 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-08 15:58 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Align metadata with the DMA mapping scheme already used for data, and
remove one more user of scatter-gather DMA mapping.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 159 +++++++++++++++++++++-------------------
 1 file changed, 82 insertions(+), 77 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index decb3ad1508a7..ab9d37d0e05dd 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -172,9 +172,7 @@ struct nvme_dev {
 	u32 last_ps;
 	bool hmb;
 	struct sg_table *hmb_sgt;
-
 	mempool_t *dmavec_mempool;
-	mempool_t *iod_meta_mempool;
 
 	/* shadow doorbell buffer support: */
 	__le32 *dbbuf_dbs;
@@ -264,6 +262,12 @@ enum nvme_iod_flags {
 
 	/* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
 	IOD_P2P_BUS_ADDR	= 1U << 3,
+
+	/* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+	IOD_META_P2P_BUS_ADDR	= 1U << 4,
+
+	/* Metadata using non-coalesced MPTR */
+	IOD_META_MPTR		= 1U << 5,
 };
 
 struct nvme_dma_vec {
@@ -281,13 +285,14 @@ struct nvme_iod {
 	u8 nr_descriptors;
 
 	unsigned int total_len;
+	unsigned int meta_total_len;
 	struct dma_iova_state dma_state;
+	struct dma_iova_state meta_dma_state;
 	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
 	struct nvme_dma_vec *dma_vecs;
 	unsigned int nr_dma_vecs;
 
 	dma_addr_t meta_dma;
-	struct sg_table meta_sgt;
 	struct nvme_sgl_desc *meta_descriptor;
 };
 
@@ -644,6 +649,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
 	return nvmeq->descriptor_pools.large;
 }
 
+static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd)
+{
+	return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG;
+}
+
 static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
 {
 	return cmd->common.flags &
@@ -711,6 +721,43 @@ static void __nvme_free_sgls(struct device *dma_dev, struct nvme_sgl_desc *sge,
 			le32_to_cpu(sg_list[i].length), dir);
 }
 
+static void nvme_free_meta_sgls(struct nvme_iod *iod, struct device *dma_dev,
+				enum dma_data_direction dir)
+{
+	struct nvme_sgl_desc *sge = iod->meta_descriptor;
+
+	__nvme_free_sgls(dma_dev, sge, &sge[1], dir);
+}
+
+static void nvme_unmap_metadata(struct request *req)
+{
+	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+	enum dma_data_direction dir = rq_dma_dir(req);
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct device *dma_dev = nvmeq->dev->dev;
+
+	if (iod->flags & IOD_META_MPTR) {
+		dma_unmap_page(dma_dev, iod->meta_dma,
+			       rq_integrity_vec(req).bv_len,
+			       rq_dma_dir(req));
+		return;
+	}
+
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+				iod->meta_total_len,
+				iod->flags & IOD_META_P2P_BUS_ADDR)) {
+		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
+			nvme_free_meta_sgls(iod, dma_dev, dir);
+		else
+			dma_unmap_page(dma_dev, iod->meta_dma,
+				       iod->meta_total_len, dir);
+	}
+
+	if (iod->meta_descriptor)
+		dma_pool_free(nvmeq->descriptor_pools.small,
+			      iod->meta_descriptor, iod->meta_dma);
+}
+
 static void nvme_free_sgls(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1023,70 +1070,59 @@ static blk_status_t nvme_map_data(struct request *req)
 	return nvme_pci_setup_data_prp(req, &iter);
 }
 
-static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
-		struct scatterlist *sg)
-{
-	sge->addr = cpu_to_le64(sg_dma_address(sg));
-	sge->length = cpu_to_le32(sg_dma_len(sg));
-	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
-}
-
 static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
+	unsigned int entries = req->nr_integrity_segments;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_dev *dev = nvmeq->dev;
 	struct nvme_sgl_desc *sg_list;
-	struct scatterlist *sgl, *sg;
-	unsigned int entries;
+	struct blk_dma_iter iter;
 	dma_addr_t sgl_dma;
-	int rc, i;
-
-	iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
-	if (!iod->meta_sgt.sgl)
-		return BLK_STS_RESOURCE;
+	int i = 0;
 
-	sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
-	iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
-							   iod->meta_sgt.sgl);
-	if (!iod->meta_sgt.orig_nents)
-		goto out_free_sg;
+	if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+						&iod->meta_dma_state, &iter))
+		return iter.status;
 
-	rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
-			     DMA_ATTR_NO_WARN);
-	if (rc)
-		goto out_free_sg;
+	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		iod->flags |= IOD_META_P2P_BUS_ADDR;
+	else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+		entries = 1;
+
+	if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
+		iod->cmd.common.metadata = cpu_to_le64(iter.addr);
+		iod->meta_total_len = iter.len;
+		iod->meta_dma = iter.addr;
+		iod->meta_descriptor = NULL;
+		return BLK_STS_OK;
+	}
 
 	sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
 			&sgl_dma);
 	if (!sg_list)
-		goto out_unmap_sg;
+		return BLK_STS_RESOURCE;
 
-	entries = iod->meta_sgt.nents;
 	iod->meta_descriptor = sg_list;
 	iod->meta_dma = sgl_dma;
-
 	iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
 	iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
-
-	sgl = iod->meta_sgt.sgl;
 	if (entries == 1) {
-		nvme_pci_sgl_set_data_sg(sg_list, sgl);
+		iod->meta_total_len = iter.len;
+		nvme_pci_sgl_set_data(sg_list, &iter);
 		return BLK_STS_OK;
 	}
 
 	sgl_dma += sizeof(*sg_list);
-	nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
-	for_each_sg(sgl, sg, entries, i)
-		nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
-
-	return BLK_STS_OK;
+	do {
+		nvme_pci_sgl_set_data(&sg_list[++i], &iter);
+		iod->meta_total_len += iter.len;
+	} while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
 
-out_unmap_sg:
-	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
-	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-	return BLK_STS_RESOURCE;
+	nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
+	if (unlikely(iter.status))
+		nvme_unmap_metadata(req);
+	return iter.status;
 }
 
 static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
@@ -1099,6 +1135,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
 	if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
 		return BLK_STS_IOERR;
 	iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
+	iod->flags |= IOD_META_MPTR;
 	return BLK_STS_OK;
 }
 
@@ -1120,7 +1157,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
 	iod->flags = 0;
 	iod->nr_descriptors = 0;
 	iod->total_len = 0;
-	iod->meta_sgt.nents = 0;
+	iod->meta_total_len = 0;
 
 	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
@@ -1231,25 +1268,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
 	*rqlist = requeue_list;
 }
 
-static __always_inline void nvme_unmap_metadata(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
-
-	if (!iod->meta_sgt.nents) {
-		dma_unmap_page(dev->dev, iod->meta_dma,
-			       rq_integrity_vec(req).bv_len,
-			       rq_dma_dir(req));
-		return;
-	}
-
-	dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
-			iod->meta_dma);
-	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-}
-
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	if (blk_integrity_rq(req))
@@ -3055,7 +3073,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
-	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
 	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
 
 	dev->dmavec_mempool = mempool_create_node(1,
@@ -3064,17 +3081,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 			dev_to_node(dev->dev));
 	if (!dev->dmavec_mempool)
 		return -ENOMEM;
-
-	dev->iod_meta_mempool = mempool_create_node(1,
-			mempool_kmalloc, mempool_kfree,
-			(void *)meta_size, GFP_KERNEL,
-			dev_to_node(dev->dev));
-	if (!dev->iod_meta_mempool)
-		goto free;
 	return 0;
-free:
-	mempool_destroy(dev->dmavec_mempool);
-	return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3524,7 +3531,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_free_queues(dev, 0);
 out_release_iod_mempool:
 	mempool_destroy(dev->dmavec_mempool);
-	mempool_destroy(dev->iod_meta_mempool);
 out_dev_unmap:
 	nvme_dev_unmap(dev);
 out_uninit_ctrl:
@@ -3588,7 +3594,6 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
 	mempool_destroy(dev->dmavec_mempool);
-	mempool_destroy(dev->iod_meta_mempool);
 	nvme_release_descriptor_pools(dev);
 	nvme_dev_unmap(dev);
 	nvme_uninit_ctrl(&dev->ctrl);
-- 
2.47.3



* Re: [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter
  2025-08-08 15:58 ` [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter Keith Busch
@ 2025-08-10 14:04   ` Christoph Hellwig
  2025-08-11 13:30     ` Keith Busch
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:04 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

> +struct blk_map_iter {
> +	phys_addr_t			paddr;
> +	u32				len;
> +	struct bvec_iter		iter;
> +	struct bio			*bio;
> +};

This now mixes in the output previously held in the phys_vec, and
instead of keeping it private to the implementation exposes it to all
the callers.  I could not find an explanation in the commit log, nor
anything that makes use of it later in the series.

If possible I'd like to keep these separate and the output isolated
in blk-mq-dma.c.  But if there's a good reason to merge them, please
add it to the commit log, and also clearly document the usage of the
fields in the (public) structure.  Especially the len member could
very easily confuse.
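
Something like this (just a sketch of the kind of documentation I
mean):

	struct blk_map_iter {
		/* output: the current physical address range */
		phys_addr_t		paddr;
		u32			len;

		/* internal iteration state, don't touch */
		struct bvec_iter	iter;
		struct bio		*bio;
	};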



* Re: [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated
  2025-08-08 15:58 ` [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
@ 2025-08-10 14:07   ` Christoph Hellwig
  2025-08-11 17:04     ` Keith Busch
  2025-08-10 14:09   ` Christoph Hellwig
  1 sibling, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:07 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:20AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> This will make it easier to add different sources of the bvec table,
> like the upcoming integrity support, rather than assuming the bio's
> bi_io_vec is always used. It also brings iterating "special" payloads
> more in line with iterating normal payloads.

I would call the array a table (or maybe array) and not a list.

> +static struct blk_map_iter blk_rq_map_iter(struct request *rq)
> +{
> +	struct bio *bio = rq->bio;
> +
> +	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
> +		return (struct blk_map_iter) {
> +			.bvec = &rq->special_vec,
> +			.iter = {
> +				.bi_size = rq->special_vec.bv_len,
> +			}
> +		};

These large struct returns generate really horrible code if they aren't
inlined (although that might happen here).  I also find them not very
nice to read.  Any reason to just pass a pointer and initialize the
needed fields?

Also this function probably should be named blk_rq_map_iter_start or
blk_rq_map_iter_init as it only is used for the very first iteration.
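
Untested sketch combining both points:

	static void blk_rq_map_iter_init(struct request *rq,
			struct blk_map_iter *iter)
	{
		memset(iter, 0, sizeof(*iter));

		if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
			iter->bvec = &rq->special_vec;
			iter->iter.bi_size = rq->special_vec.bv_len;
			return;
		}

		/* the internal flush request may not have a bio attached */
		if (!rq->bio)
			return;

		iter->bio = rq->bio;
		iter->bvec = rq->bio->bi_io_vec;
		iter->iter = rq->bio->bi_iter;
	}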



* Re: [PATCHv5 3/8] blk-mq-dma: require unmap caller provide p2p map type
  2025-08-08 15:58 ` [PATCHv5 3/8] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
@ 2025-08-10 14:08   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:08 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:21AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> In preparation for integrity DMA mappings, we can't rely on the request
> flag because data and metadata may have different mapping types.

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>



* Re: [PATCHv5 4/8] blk-mq: remove REQ_P2PDMA flag
  2025-08-08 15:58 ` [PATCHv5 4/8] blk-mq: remove REQ_P2PDMA flag Keith Busch
@ 2025-08-10 14:08   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:08 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:22AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> It's not serving any particular purpose. pci_p2pdma_state() already has
> all the appropriate checks, so the config and flag checks are not
> guarding anything.

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>



* Re: [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated
  2025-08-08 15:58 ` [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
  2025-08-10 14:07   ` Christoph Hellwig
@ 2025-08-10 14:09   ` Christoph Hellwig
  1 sibling, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:09 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:20AM -0700, Keith Busch wrote:
> +	struct bio_vec                  *bvec;

.. and another thing.  bvec feels a bit confusing as the pointer is not
to the current bvec, but to the base bvec table, and the name should
express that somehow.  Maybe just bvecs for the shortest possible
way to kinda express that?



* Re: [PATCHv5 5/8] blk-mq-dma: move common dma start code to a helper
  2025-08-08 15:58 ` [PATCHv5 5/8] blk-mq-dma: move common dma start code to a helper Keith Busch
@ 2025-08-10 14:10   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:10 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>



* Re: [PATCHv5 6/8] blk-mq-dma: add support for mapping integrity metadata
  2025-08-08 15:58 ` [PATCHv5 6/8] blk-mq-dma: add support for mapping integrity metadata Keith Busch
@ 2025-08-10 14:16   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:16 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:24AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Provide integrity metadata helpers, equivalent to the data payload
> helpers, for iterating a request during DMA setup.

Can you please also convert the SG mapping helpers to use the low-level
iterator first like I've done for the data path?  That ensures we have
less code to maintain, common behavior and also smaller kernel binaries.

> +static bool __blk_map_iter_next(struct blk_map_iter *iter)
> +{
> +	if (iter->iter.bi_size)
> +		return true;
> +	if (!iter->bio || !iter->bio->bi_next)
> +		return false;
> +
> +	iter->bio = iter->bio->bi_next;
> +#ifdef CONFIG_BLK_DEV_INTEGRITY
> +	if (iter->is_integrity) {
> +		iter->iter = iter->bio->bi_integrity->bip_iter;
> +		iter->bvec = iter->bio->bi_integrity->bip_vec;
> +		return true;
> +	}
> +#endif
> +	iter->iter = iter->bio->bi_iter;
> +	iter->bvec = iter->bio->bi_io_vec;
> +	return true;

I wonder if we should use bio_integrity() here.  That would introduce
two (probably optimized down to one by the compiler) extra, easily
predicted branches for the integrity mapping, but make the thing look
much nicer as it would kill the ifdef and the ugly structure around it:

	if (iter->is_integrity) {
		iter->iter = bio_integrity(iter->bio)->bip_iter;
		iter->bvec = bio_integrity(iter->bio)->bip_vec;
	} else {
		iter->iter = iter->bio->bi_iter;
		iter->bvec = iter->bio->bi_io_vec;
	}

	return true;

> +#ifdef CONFIG_BLK_DEV_INTEGRITY
> +bool blk_rq_integrity_dma_map_iter_start(struct request *req,
> +		struct device *dma_dev,  struct dma_iova_state *state,
> +		struct blk_dma_iter *iter)

Can you please add a kerneldoc comment here, which could be easily
adapted from the data mapping path.

> +bool blk_rq_integrity_dma_map_iter_next(struct request *req,
> +               struct device *dma_dev, struct blk_dma_iter *iter)

Same here.



* Re: [PATCHv5 7/8] nvme-pci: create common sgl unmapping helper
  2025-08-08 15:58 ` [PATCHv5 7/8] nvme-pci: create common sgl unmapping helper Keith Busch
@ 2025-08-10 14:21   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:21 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:25AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> This can be reused by metadata SGLs once they start using the blk-mq
> DMA API.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  drivers/nvme/host/pci.c | 31 ++++++++++++++++++++-----------
>  1 file changed, 20 insertions(+), 11 deletions(-)

> +static void __nvme_free_sgls(struct device *dma_dev, struct nvme_sgl_desc *sge,
> +		struct nvme_sgl_desc *sg_list, enum dma_data_direction dir)
> +{
> +	unsigned int len = le32_to_cpu(sge->length);
> +	unsigned int i, nr_entries;
> +
> +	if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
> +		dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
> +		return;
> +	}
> +
> +	nr_entries = len / sizeof(*sg_list);
> +	for (i = 0; i < nr_entries; i++)


We can probably just do away with the nr_entries variable, as the
compiler is not going to recompute this for every loop iteration.
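
i.e. just:

	for (i = 0; i < len / sizeof(*sg_list); i++)
		dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
			le32_to_cpu(sg_list[i].length), dir);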


>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
>  	struct device *dma_dev = nvmeq->dev->dev;
>  	struct nvme_sgl_desc *sg_list = iod->descriptors[0];
>  	enum dma_data_direction dir = rq_dma_dir(req);
>  
> +	__nvme_free_sgls(dma_dev, sge, sg_list, dir);

Shouldn't we move the calculation of nvmeq, dma_dev and dir into
__nvme_free_sgls, as they are going to be the same for data and
metadata?  And then maybe rename it to nvme_free_sgls and just opencode
the iod->descriptors[0] and &iod->cmd.common.dptr.sgl dereferences in
nvme_unmap_data?



* Re: [PATCHv5 8/8] nvme-pci: convert metadata mapping to dma iter
  2025-08-08 15:58 ` [PATCHv5 8/8] nvme-pci: convert metadata mapping to dma iter Keith Busch
@ 2025-08-10 14:27   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-10 14:27 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch

On Fri, Aug 08, 2025 at 08:58:26AM -0700, Keith Busch wrote:
>  
>  struct nvme_dma_vec {
> @@ -281,13 +285,14 @@ struct nvme_iod {
>  	u8 nr_descriptors;
>  
>  	unsigned int total_len;
> +	unsigned int meta_total_len;
>  	struct dma_iova_state dma_state;
> +	struct dma_iova_state meta_dma_state;
>  	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
>  	struct nvme_dma_vec *dma_vecs;
>  	unsigned int nr_dma_vecs;
>  
>  	dma_addr_t meta_dma;
> -	struct sg_table meta_sgt;
>  	struct nvme_sgl_desc *meta_descriptor;

Maybe keep the meta fields together as much as we can to ensure they
are in the same cacheline(s)?

> +static void nvme_unmap_metadata(struct request *req)
> +{
> +	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> +	enum dma_data_direction dir = rq_dma_dir(req);
> +	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +	struct device *dma_dev = nvmeq->dev->dev;
> +
> +	if (iod->flags & IOD_META_MPTR) {
> +		dma_unmap_page(dma_dev, iod->meta_dma,
> +			       rq_integrity_vec(req).bv_len,
> +			       rq_dma_dir(req));
> +		return;
> +	}
> +
> +	if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
> +				iod->meta_total_len,
> +				iod->flags & IOD_META_P2P_BUS_ADDR)) {
> +		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
> +			nvme_free_meta_sgls(iod, dma_dev, dir);
> +		else
> +			dma_unmap_page(dma_dev, iod->meta_dma,
> +				       iod->meta_total_len, dir);
> +	}

IOD_META_MPTR above really should be named IOD_SINGLE_META_SEGMENT as
it's all about avoiding the dma iterator, which could also create a
single segment case handled just above.

>  static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
>  {
>  	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> +	unsigned int entries = req->nr_integrity_segments;
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +	struct nvme_dev *dev = nvmeq->dev;
>  	struct nvme_sgl_desc *sg_list;
> +	struct blk_dma_iter iter;
>  	dma_addr_t sgl_dma;
> +	int i = 0;
>  
> +	if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
> +						&iod->meta_dma_state, &iter))
> +		return iter.status;
>  
> +	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
> +		iod->flags |= IOD_META_P2P_BUS_ADDR;
> +	else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
> +		entries = 1;
> +
> +	if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
> +		iod->cmd.common.metadata = cpu_to_le64(iter.addr);
> +		iod->meta_total_len = iter.len;
> +		iod->meta_dma = iter.addr;
> +		iod->meta_descriptor = NULL;
> +		return BLK_STS_OK;

Maybe throw in a comment explaining that we fall back to a single metadata
pointer here if we can, and why we don't for passthrough requests?



* Re: [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter
  2025-08-10 14:04   ` Christoph Hellwig
@ 2025-08-11 13:30     ` Keith Busch
  2025-08-11 14:05       ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Keith Busch @ 2025-08-11 13:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe, joshi.k

On Sun, Aug 10, 2025 at 04:04:48PM +0200, Christoph Hellwig wrote:
> > +struct blk_map_iter {
> > +	phys_addr_t			paddr;
> > +	u32				len;
> > +	struct bvec_iter		iter;
> > +	struct bio			*bio;
> > +};
> 
> This now mixes in the output previously held in the phys_vec, and
> instead of keeping it private to the implementation exposes it to all
> the callers.  I could not find an explanation in the commit log, nor
> anything that makes use of it later in the series.
> 
> If possible I'd like to keep these separate and the output isolated
> in blk-mq-dma.c.  But if there's a good reason to merge them, please
> add it to the commit log, and also clearly document the usage of the
> fields in the (public) structure.  Especially the len member could
> very easily confuse.

Perhaps I misunderstood the assignment:

  https://lore.kernel.org/linux-block/20250722055339.GB13634@lst.de/

I thought you were saying you wanted the lower part (bvec pages -> phys
addrs) and the upper part (phys -> dma) available as an API rather than
keeping the lower part private to blk-dma. This patch helps move toward
that, and the next patch provides a common place to stash the bvec
array that's being iterated.


* Re: [PATCHv5 1/8] blk-mq-dma: introduce blk_map_iter
  2025-08-11 13:30     ` Keith Busch
@ 2025-08-11 14:05       ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-08-11 14:05 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-block, linux-nvme, axboe,
	joshi.k

On Mon, Aug 11, 2025 at 07:30:53AM -0600, Keith Busch wrote:
> Perhaps I misunderstood the assignment:
> 
>   https://lore.kernel.org/linux-block/20250722055339.GB13634@lst.de/
> 
> I thought you were saying you wanted the lower part (bvec pages -> phys
> addrs) and the upper part (phys -> dma) available as an API rather than
> keeping the lower part private to blk-dma. This patch helps move toward
> that, and the next patch provides a common place to stash the bvec
> array that's being iterated.

Eventually we might want to make it public, yes.  But that works just
fine with the phys_vec.  In fact Leon has a series outstanding currently
that makes struct phys_vec public without any other changes to this
code.


* Re: [PATCHv5 2/8] blk-mq-dma: provide the bio_vec list being iterated
  2025-08-10 14:07   ` Christoph Hellwig
@ 2025-08-11 17:04     ` Keith Busch
  0 siblings, 0 replies; 21+ messages in thread
From: Keith Busch @ 2025-08-11 17:04 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, linux-nvme, axboe, joshi.k

On Sun, Aug 10, 2025 at 04:07:47PM +0200, Christoph Hellwig wrote:
> On Fri, Aug 08, 2025 at 08:58:20AM -0700, Keith Busch wrote:
> > +static struct blk_map_iter blk_rq_map_iter(struct request *rq)
> > +{
> > +	struct bio *bio = rq->bio;
> > +
> > +	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
> > +		return (struct blk_map_iter) {
> > +			.bvec = &rq->special_vec,
> > +			.iter = {
> > +				.bi_size = rq->special_vec.bv_len,
> > +			}
> > +		};
> 
> These large struct returns generate really horrible code if they aren't
> inlined (although that might happen here).  I also find them not very
> nice to read.  Any reason to just pass a pointer and initialize the
> needed fields?

I initially set out to make a macro, inspired by other block iterator
setups like "bvec_iter_bvec", but I thought the extra cases to handle
were better implemented as an inline function. I am definitely counting
on this being inlined to produce good code, so I should have annotated
that. No problem with switching to take a pointer, but I doubt the
resulting assembly is better either way.

