Linux block layer
 help / color / mirror / Atom feed
* [PATCHv3 0/7] blk dma iter for integrity metadata
@ 2025-07-29 14:34 Keith Busch
  2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
                   ` (6 more replies)
  0 siblings, 7 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Previous version:

  https://lore.kernel.org/linux-nvme/20250720184040.2402790-1-kbusch@meta.com/

Changes from v2:

  - introduce the "blk_map_iter" type for the lower level's physical
    address mapping.

  - fixed missing "static inline" for stub functions

  - added "is_" prefixes to bool types

  - nvme uses the MPTR method when it's a single entry, except for user
    commands, which will always use SGL when possible. This nicely
    unifies the setup and teardown for each, too.

Keith Busch (7):
  blk-mq: introduce blk_map_iter
  blk-mq-dma: provide the bio_vec list being iterated
  blk-mq-dma: require unmap caller provide p2p map type
  blk-mq: remove REQ_P2PDMA flag
  blk-mq-dma: move common dma start code to a helper
  blk-mq-dma: add support for mapping integrity metadata
  nvme: convert metadata mapping to dma iter

 block/bio.c                   |   2 +-
 block/blk-mq-dma.c            | 226 ++++++++++++++++++++--------------
 drivers/nvme/host/pci.c       | 181 +++++++++++++--------------
 include/linux/blk-integrity.h |  17 +++
 include/linux/blk-mq-dma.h    |  16 ++-
 include/linux/blk_types.h     |   1 -
 6 files changed, 253 insertions(+), 190 deletions(-)

-- 
2.47.3


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCHv3 1/7] blk-mq: introduce blk_map_iter
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-30  6:52   ` Kanchan Joshi
  2025-07-30  8:18   ` Kanchan Joshi
  2025-07-29 14:34 ` [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
                   ` (5 subsequent siblings)
  6 siblings, 2 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Create a type that fully captures the lower level physical address
iteration.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c         | 93 +++++++++++++++++---------------------
 include/linux/blk-mq-dma.h |  9 +++-
 2 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ad283017caef2..61fbdb715220f 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -5,13 +5,7 @@
 #include <linux/blk-mq-dma.h>
 #include "blk.h"
 
-struct phys_vec {
-	phys_addr_t	paddr;
-	u32		len;
-};
-
-static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
-			      struct phys_vec *vec)
+static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 {
 	unsigned int max_size;
 	struct bio_vec bv;
@@ -19,8 +13,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
 		if (!iter->bio)
 			return false;
-		vec->paddr = bvec_phys(&req->special_vec);
-		vec->len = req->special_vec.bv_len;
+		iter->paddr = bvec_phys(&req->special_vec);
+		iter->len = req->special_vec.bv_len;
 		iter->bio = NULL;
 		return true;
 	}
@@ -29,8 +23,8 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 		return false;
 
 	bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
-	vec->paddr = bvec_phys(&bv);
-	max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
+	iter->paddr = bvec_phys(&bv);
+	max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX);
 	bv.bv_len = min(bv.bv_len, max_size);
 	bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
 
@@ -39,12 +33,11 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 	 * one could be merged into it.  This typically happens when moving to
 	 * the next bio, but some callers also don't pack bvecs tight.
 	 */
-	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
+	while (!iter->iter.bi_size ||
+	       (!iter->iter.bi_bvec_done && iter->bio->bi_next)) {
 		struct bio_vec next;
 
 		if (!iter->iter.bi_size) {
-			if (!iter->bio->bi_next)
-				break;
 			iter->bio = iter->bio->bi_next;
 			iter->iter = iter->bio->bi_iter;
 		}
@@ -58,7 +51,7 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 		bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
 	}
 
-	vec->len = bv.bv_len;
+	iter->len = bv.bv_len;
 	return true;
 }
 
@@ -77,29 +70,29 @@ static inline bool blk_can_dma_map_iova(struct request *req,
 		dma_get_merge_boundary(dma_dev));
 }
 
-static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+static bool blk_dma_map_bus(struct blk_dma_iter *iter)
 {
-	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
-	iter->len = vec->len;
+	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, iter->iter.paddr);
+	iter->len = iter->iter.len;
 	return true;
 }
 
 static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
-		struct blk_dma_iter *iter, struct phys_vec *vec)
+		struct blk_dma_iter *iter)
 {
-	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
-			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
-	if (dma_mapping_error(dma_dev, iter->addr)) {
+	iter->addr = dma_map_page(dma_dev, phys_to_page(iter->iter.paddr),
+			offset_in_page(iter->iter.paddr), iter->iter.len,
+			rq_dma_dir(req));
+	if (dma_mapping_error(dma_dev, iter->iter.paddr)) {
 		iter->status = BLK_STS_RESOURCE;
 		return false;
 	}
-	iter->len = vec->len;
+	iter->len = iter->iter.len;
 	return true;
 }
 
 static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, struct blk_dma_iter *iter,
-		struct phys_vec *vec)
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
 	enum dma_data_direction dir = rq_dma_dir(req);
 	unsigned int mapped = 0;
@@ -109,12 +102,12 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	iter->len = dma_iova_size(state);
 
 	do {
-		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
-				vec->len, dir, 0);
+		error = dma_iova_link(dma_dev, state, iter->iter.paddr, mapped,
+				iter->iter.len, dir, 0);
 		if (error)
 			break;
-		mapped += vec->len;
-	} while (blk_map_iter_next(req, &iter->iter, vec));
+		mapped += iter->iter.len;
+	} while (blk_map_iter_next(req, &iter->iter));
 
 	error = dma_iova_sync(dma_dev, state, 0, mapped);
 	if (error) {
@@ -151,10 +144,10 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
 	unsigned int total_len = blk_rq_payload_bytes(req);
-	struct phys_vec vec;
+	struct blk_map_iter *map_iter = &iter->iter;
 
-	iter->iter.bio = req->bio;
-	iter->iter.iter = req->bio->bi_iter;
+	map_iter->bio = req->bio;
+	map_iter->iter = req->bio->bi_iter;
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
 
@@ -162,14 +155,14 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	 * Grab the first segment ASAP because we'll need it to check for P2P
 	 * transfers.
 	 */
-	if (!blk_map_iter_next(req, &iter->iter, &vec))
+	if (!blk_map_iter_next(req, map_iter))
 		return false;
 
 	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
 		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
-					 phys_to_page(vec.paddr))) {
+					 phys_to_page(map_iter->paddr))) {
 		case PCI_P2PDMA_MAP_BUS_ADDR:
-			return blk_dma_map_bus(iter, &vec);
+			return blk_dma_map_bus(iter);
 		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
 			/*
 			 * P2P transfers through the host bridge are treated the
@@ -184,9 +177,9 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	}
 
 	if (blk_can_dma_map_iova(req, dma_dev) &&
-	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
-		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
-	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+	    dma_iova_try_alloc(dma_dev, state, map_iter->paddr, total_len))
+		return blk_rq_dma_map_iova(req, dma_dev, state, iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 
@@ -211,14 +204,12 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
-	struct phys_vec vec;
-
-	if (!blk_map_iter_next(req, &iter->iter, &vec))
+	if (!blk_map_iter_next(req, &iter->iter))
 		return false;
 
 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
-		return blk_dma_map_bus(iter, &vec);
-	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+		return blk_dma_map_bus(iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
 
@@ -246,20 +237,20 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
 		    struct scatterlist **last_sg)
 {
-	struct req_iterator iter = {
-		.bio	= rq->bio,
+	struct bio *bio = rq->bio;
+	struct blk_map_iter iter = {
+		.bio	= bio,
 	};
-	struct phys_vec vec;
 	int nsegs = 0;
 
 	/* the internal flush request may not have bio attached */
-	if (iter.bio)
-		iter.iter = iter.bio->bi_iter;
+	if (bio)
+		iter.iter = bio->bi_iter;
 
-	while (blk_map_iter_next(rq, &iter, &vec)) {
+	while (blk_map_iter_next(rq, &iter)) {
 		*last_sg = blk_next_sg(last_sg, sglist);
-		sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
-				offset_in_page(vec.paddr));
+		sg_set_page(*last_sg, phys_to_page(iter.paddr), iter.len,
+				offset_in_page(iter.paddr));
 		nsegs++;
 	}
 
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c26a01aeae006..1e5988afdb978 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -5,6 +5,13 @@
 #include <linux/blk-mq.h>
 #include <linux/pci-p2pdma.h>
 
+struct blk_map_iter {
+	phys_addr_t			paddr;
+	u32				len;
+	struct bvec_iter		iter;
+	struct bio			*bio;
+};
+
 struct blk_dma_iter {
 	/* Output address range for this iteration */
 	dma_addr_t			addr;
@@ -14,7 +21,7 @@ struct blk_dma_iter {
 	blk_status_t			status;
 
 	/* Internal to blk_rq_dma_map_iter_* */
-	struct req_iterator		iter;
+	struct blk_map_iter		iter;
 	struct pci_p2pdma_map_state	p2pdma;
 };
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
  2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 20:55   ` Keith Busch
  2025-07-30 12:15   ` Kanchan Joshi
  2025-07-29 14:34 ` [PATCHv3 3/7] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
                   ` (4 subsequent siblings)
  6 siblings, 2 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

This will make it easier to add different sources of the bvec table,
like for upcoming integrity support, rather than assuming the bio's
bi_io_vec is always used. It also makes iterating "special" payloads
more consistent with iterating normal payloads.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c         | 30 ++++++++++++++++--------------
 include/linux/blk-mq-dma.h |  1 +
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 61fbdb715220f..08ce66175a7a3 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -10,23 +10,17 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 	unsigned int max_size;
 	struct bio_vec bv;
 
-	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		if (!iter->bio)
-			return false;
-		iter->paddr = bvec_phys(&req->special_vec);
-		iter->len = req->special_vec.bv_len;
-		iter->bio = NULL;
-		return true;
-	}
-
 	if (!iter->iter.bi_size)
 		return false;
 
-	bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+	bv = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 	iter->paddr = bvec_phys(&bv);
 	max_size = get_max_segment_size(&req->q->limits, iter->paddr, UINT_MAX);
 	bv.bv_len = min(bv.bv_len, max_size);
-	bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+	bvec_iter_advance_single(iter->bvec, &iter->iter, bv.bv_len);
+
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return true;
 
 	/*
 	 * If we are entirely done with this bi_io_vec entry, check if the next
@@ -40,15 +34,16 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 		if (!iter->iter.bi_size) {
 			iter->bio = iter->bio->bi_next;
 			iter->iter = iter->bio->bi_iter;
+			iter->bvec = iter->bio->bi_io_vec;
 		}
 
-		next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+		next = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 		if (bv.bv_len + next.bv_len > max_size ||
 		    !biovec_phys_mergeable(req->q, &bv, &next))
 			break;
 
 		bv.bv_len += next.bv_len;
-		bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+		bvec_iter_advance_single(iter->bvec, &iter->iter, next.bv_len);
 	}
 
 	iter->len = bv.bv_len;
@@ -151,6 +146,11 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+		iter->iter.bvec = &req->special_vec;
+	else
+		iter->iter.bvec = req->bio->bi_io_vec;
+
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
 	 * transfers.
@@ -244,8 +244,10 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
 	int nsegs = 0;
 
 	/* the internal flush request may not have bio attached */
-	if (bio)
+	if (bio) {
 		iter.iter = bio->bi_iter;
+		iter.bvec = bio->bi_io_vec;
+	}
 
 	while (blk_map_iter_next(rq, &iter)) {
 		*last_sg = blk_next_sg(last_sg, sglist);
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 1e5988afdb978..c82f880dee914 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -8,6 +8,7 @@
 struct blk_map_iter {
 	phys_addr_t			paddr;
 	u32				len;
+	struct bio_vec                  *bvec;
 	struct bvec_iter		iter;
 	struct bio			*bio;
 };
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 3/7] blk-mq-dma: require unmap caller provide p2p map type
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
  2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
  2025-07-29 14:34 ` [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 14:34 ` [PATCHv3 4/7] blk-mq: remove REQ_P2PDMA flag Keith Busch
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

In preparation for integrity dma mappings, we can't rely on the request
flag because data and metadata may have different mapping types.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c    | 9 ++++++++-
 include/linux/blk-mq-dma.h | 5 +++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 071efec25346f..6cefa8344f670 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -261,6 +261,9 @@ enum nvme_iod_flags {
 
 	/* single segment dma mapping */
 	IOD_SINGLE_SEGMENT	= 1U << 2,
+
+	/* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+	IOD_P2P_BUS_ADDR	= 1U << 3,
 };
 
 struct nvme_dma_vec {
@@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req)
 		return;
 	}
 
-	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+				iod->flags & IOD_P2P_BUS_ADDR)) {
 		if (nvme_pci_cmd_use_sgl(&iod->cmd))
 			nvme_free_sgls(req);
 		else
@@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req)
 	if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
 		return iter.status;
 
+	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		iod->flags |= IOD_P2P_BUS_ADDR;
+
 	if (use_sgl == SGL_FORCED ||
 	    (use_sgl == SGL_SUPPORTED &&
 	     (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c82f880dee914..aef8d784952ff 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -49,14 +49,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
  * @dma_dev:	device to unmap from
  * @state:	DMA IOVA state
  * @mapped_len: number of bytes to unmap
+ * @is_p2p:	true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
  *
  * Returns %false if the callers need to manually unmap every DMA segment
  * mapped using @iter or %true if no work is left to be done.
  */
 static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
+		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
 {
-	if (req->cmd_flags & REQ_P2PDMA)
+	if (is_p2p)
 		return true;
 
 	if (dma_use_iova(state)) {
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 4/7] blk-mq: remove REQ_P2PDMA flag
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
                   ` (2 preceding siblings ...)
  2025-07-29 14:34 ` [PATCHv3 3/7] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 14:34 ` [PATCHv3 5/7] blk-mq-dma: move common dma start code to a helper Keith Busch
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

It's not serving any particular purpose. pci_p2pdma_state() already has
all the appropriate checks, so the CONFIG and flag checks are not
guarding anything.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c               |  2 +-
 block/blk-mq-dma.c        | 30 ++++++++++++++----------------
 include/linux/blk_types.h |  1 -
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 92c512e876c8d..f56d285e6958e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
 	WARN_ON_ONCE(bio_full(bio, len));
 
 	if (is_pci_p2pdma_page(page))
-		bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+		bio->bi_opf |= REQ_NOMERGE;
 
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
 	bio->bi_iter.bi_size += len;
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 08ce66175a7a3..87c9a7bfa090d 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -158,22 +158,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 	if (!blk_map_iter_next(req, map_iter))
 		return false;
 
-	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
-		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
-					 phys_to_page(map_iter->paddr))) {
-		case PCI_P2PDMA_MAP_BUS_ADDR:
-			return blk_dma_map_bus(iter);
-		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-			/*
-			 * P2P transfers through the host bridge are treated the
-			 * same as non-P2P transfers below and during unmap.
-			 */
-			req->cmd_flags &= ~REQ_P2PDMA;
-			break;
-		default:
-			iter->status = BLK_STS_INVAL;
-			return false;
-		}
+	switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+				 phys_to_page(map_iter->paddr))) {
+	case PCI_P2PDMA_MAP_BUS_ADDR:
+		return blk_dma_map_bus(iter);
+	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+		/*
+		 * P2P transfers through the host bridge are treated the
+		 * same as non-P2P transfers below and during unmap.
+		 */
+	case PCI_P2PDMA_MAP_NONE:
+		break;
+	default:
+		iter->status = BLK_STS_INVAL;
+		return false;
 	}
 
 	if (blk_can_dma_map_iova(req, dma_dev) &&
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09b99d52fd365..0a29b20939d17 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -419,7 +419,6 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 5/7] blk-mq-dma: move common dma start code to a helper
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
                   ` (3 preceding siblings ...)
  2025-07-29 14:34 ` [PATCHv3 4/7] blk-mq: remove REQ_P2PDMA flag Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 14:34 ` [PATCHv3 6/7] blk-mq-dma: add support for mapping integrity metadata Keith Busch
  2025-07-29 14:34 ` [PATCHv3 7/7] nvme: convert metadata mapping to dma iter Keith Busch
  6 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

In preparation for dma mapping integrity metadata, move the common dma
setup to a helper.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c | 69 +++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 87c9a7bfa090d..646caa00a0485 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -113,44 +113,16 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	return true;
 }
 
-/**
- * blk_rq_dma_map_iter_start - map the first DMA segment for a request
- * @req:	request to map
- * @dma_dev:	device to map to
- * @state:	DMA IOVA state
- * @iter:	block layer DMA iterator
- *
- * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
- * caller and don't need to be initialized.  @state needs to be stored for use
- * at unmap time, @iter is only needed at map time.
- *
- * Returns %false if there is no segment to map, including due to an error, or
- * %true ft it did map a segment.
- *
- * If a segment was mapped, the DMA address for it is returned in @iter.addr and
- * the length in @iter.len.  If no segment was mapped the status code is
- * returned in @iter.status.
- *
- * The caller can call blk_rq_dma_map_coalesce() to check if further segments
- * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
- * to try to map the following segments.
- */
-bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, struct blk_dma_iter *iter)
+static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter,
+		unsigned int total_len)
 {
-	unsigned int total_len = blk_rq_payload_bytes(req);
 	struct blk_map_iter *map_iter = &iter->iter;
 
 	map_iter->bio = req->bio;
-	map_iter->iter = req->bio->bi_iter;
 	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
 	iter->status = BLK_STS_OK;
 
-	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
-		iter->iter.bvec = &req->special_vec;
-	else
-		iter->iter.bvec = req->bio->bi_io_vec;
-
 	/*
 	 * Grab the first segment ASAP because we'll need it to check for P2P
 	 * transfers.
@@ -179,6 +151,41 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		return blk_rq_dma_map_iova(req, dma_dev, state, iter);
 	return blk_dma_map_direct(req, dma_dev, iter);
 }
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req:	request to map
+ * @dma_dev:	device to map to
+ * @state:	DMA IOVA state
+ * @iter:	block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
+ * caller and don't need to be initialized.  @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len.  If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+	iter->iter.iter = req->bio->bi_iter;
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+		iter->iter.bvec = &req->special_vec;
+	else
+		iter->iter.bvec = req->bio->bi_io_vec;
+
+	return blk_dma_map_iter_start(req, dma_dev, state, iter,
+				      blk_rq_payload_bytes(req));
+}
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
 
 /**
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 6/7] blk-mq-dma: add support for mapping integrity metadata
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
                   ` (4 preceding siblings ...)
  2025-07-29 14:34 ` [PATCHv3 5/7] blk-mq-dma: move common dma start code to a helper Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 14:34 ` [PATCHv3 7/7] nvme: convert metadata mapping to dma iter Keith Busch
  6 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Provide integrity metadata helpers equivalent to the data payload
helpers for iterating a request for dma setup.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-mq-dma.c            | 54 +++++++++++++++++++++++++++++++----
 include/linux/blk-integrity.h | 17 +++++++++++
 include/linux/blk-mq-dma.h    |  1 +
 3 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 646caa00a0485..708fb6f51efc7 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,9 +2,28 @@
 /*
  * Copyright (C) 2025 Christoph Hellwig
  */
+#include <linux/blk-integrity.h>
 #include <linux/blk-mq-dma.h>
 #include "blk.h"
 
+static bool __blk_map_iter_next(struct blk_map_iter *iter)
+{
+	if (iter->iter.bi_size)
+		return true;
+	if (!iter->bio->bi_next)
+		return false;
+
+	iter->bio = iter->bio->bi_next;
+	if (iter->is_integrity) {
+		iter->iter = iter->bio->bi_integrity->bip_iter;
+		iter->bvec = iter->bio->bi_integrity->bip_vec;
+	} else {
+		iter->iter = iter->bio->bi_iter;
+		iter->bvec = iter->bio->bi_io_vec;
+	}
+	return true;
+}
+
 static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 {
 	unsigned int max_size;
@@ -31,11 +50,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
 	       (!iter->iter.bi_bvec_done && iter->bio->bi_next)) {
 		struct bio_vec next;
 
-		if (!iter->iter.bi_size) {
-			iter->bio = iter->bio->bi_next;
-			iter->iter = iter->bio->bi_iter;
-			iter->bvec = iter->bio->bi_io_vec;
-		}
+		if (!__blk_map_iter_next(iter))
+			break;
 
 		next = mp_bvec_iter_bvec(iter->bvec, iter->iter);
 		if (bv.bv_len + next.bv_len > max_size ||
@@ -178,6 +194,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
 		struct dma_iova_state *state, struct blk_dma_iter *iter)
 {
 	iter->iter.iter = req->bio->bi_iter;
+	iter->iter.is_integrity = false;
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
 		iter->iter.bvec = &req->special_vec;
 	else
@@ -218,6 +235,33 @@ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
 }
 EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
 
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter)
+{
+	unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
+					   blk_rq_sectors(req));
+	iter->iter.iter = req->bio->bi_integrity->bip_iter;
+	iter->iter.bvec = req->bio->bi_integrity->bip_vec;
+	iter->iter.is_integrity = true;
+	return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);
+
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+               struct device *dma_dev, struct blk_dma_iter *iter)
+{
+	if (!blk_map_iter_next(req, &iter->iter))
+		return false;
+
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		return blk_dma_map_bus(iter);
+	return blk_dma_map_direct(req, dma_dev, iter);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
+#endif
+
 static inline struct scatterlist *
 blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 {
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c7eae0bfb013f..1ef0c373297b3 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -4,6 +4,7 @@
 
 #include <linux/blk-mq.h>
 #include <linux/bio-integrity.h>
+#include <linux/blk-mq-dma.h>
 
 struct request;
 
@@ -29,6 +30,11 @@ int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter);
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -108,6 +114,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq,
 {
 	return -EINVAL;
 }
+static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+		struct device *dma_dev,  struct dma_iova_state *state,
+		struct blk_dma_iter *iter)
+{
+	return false;
+}
+static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+		struct device *dma_dev, struct blk_dma_iter *iter)
+{
+	return false;
+}
 static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
 {
 	return NULL;
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index aef8d784952ff..4b7b85f9e23e6 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -11,6 +11,7 @@ struct blk_map_iter {
 	struct bio_vec                  *bvec;
 	struct bvec_iter		iter;
 	struct bio			*bio;
+	bool				is_integrity;
 };
 
 struct blk_dma_iter {
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCHv3 7/7] nvme: convert metadata mapping to dma iter
  2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
                   ` (5 preceding siblings ...)
  2025-07-29 14:34 ` [PATCHv3 6/7] blk-mq-dma: add support for mapping integrity metadata Keith Busch
@ 2025-07-29 14:34 ` Keith Busch
  2025-07-29 18:56   ` Keith Busch
  6 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2025-07-29 14:34 UTC (permalink / raw)
  To: linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Aligns data and metadata to the similar dma mapping scheme and removes
one more user of the scatter-gather dma mapping.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 174 +++++++++++++++++++---------------------
 1 file changed, 81 insertions(+), 93 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6cefa8344f670..6fdc0023c4abb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -172,9 +172,7 @@ struct nvme_dev {
 	u32 last_ps;
 	bool hmb;
 	struct sg_table *hmb_sgt;
-
 	mempool_t *dmavec_mempool;
-	mempool_t *iod_meta_mempool;
 
 	/* shadow doorbell buffer support: */
 	__le32 *dbbuf_dbs;
@@ -264,6 +262,12 @@ enum nvme_iod_flags {
 
 	/* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
 	IOD_P2P_BUS_ADDR	= 1U << 3,
+
+	/* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+	IOD_META_P2P_BUS_ADDR	= 1U << 4,
+
+	/* Metadata DMA mapped as a single SGL data segment */
+	IOD_META_SINGLE_SEG	= 1U << 5,
 };
 
 struct nvme_dma_vec {
@@ -281,13 +285,14 @@ struct nvme_iod {
 	u8 nr_descriptors;
 
 	unsigned int total_len;
+	unsigned int meta_total_len;
 	struct dma_iova_state dma_state;
+	struct dma_iova_state meta_dma_state;
 	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
 	struct nvme_dma_vec *dma_vecs;
 	unsigned int nr_dma_vecs;
 
 	dma_addr_t meta_dma;
-	struct sg_table meta_sgt;
 	struct nvme_sgl_desc *meta_descriptor;
 };
 
@@ -604,8 +609,7 @@ static inline bool nvme_pci_metadata_use_sgls(struct request *req)
 
 	if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
 		return false;
-	return req->nr_integrity_segments > 1 ||
-		nvme_req(req)->flags & NVME_REQ_USERCMD;
+	return nvme_req(req)->flags & NVME_REQ_USERCMD;
 }
 
 static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
@@ -644,6 +648,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
 	return nvmeq->descriptor_pools.large;
 }
 
+static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd)
+{
+	return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG;
+}
+
 static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
 {
 	return cmd->common.flags &
@@ -1014,93 +1023,65 @@ static blk_status_t nvme_map_data(struct request *req)
 	return nvme_pci_setup_data_prp(req, &iter);
 }
 
-static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
-		struct scatterlist *sg)
-{
-	sge->addr = cpu_to_le64(sg_dma_address(sg));
-	sge->length = cpu_to_le32(sg_dma_len(sg));
-	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
-}
-
-static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
+static blk_status_t nvme_map_metadata(struct request *req)
 {
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
+	unsigned int entries = req->nr_integrity_segments;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_dev *dev = nvmeq->dev;
 	struct nvme_sgl_desc *sg_list;
-	struct scatterlist *sgl, *sg;
-	unsigned int entries;
+	struct blk_dma_iter iter;
 	dma_addr_t sgl_dma;
-	int rc, i;
-
-	iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
-	if (!iod->meta_sgt.sgl)
-		return BLK_STS_RESOURCE;
+	int i;
 
-	sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
-	iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
-							   iod->meta_sgt.sgl);
-	if (!iod->meta_sgt.orig_nents)
-		goto out_free_sg;
+	if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+						&iod->meta_dma_state, &iter))
+		return iter.status;
 
-	rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
-			     DMA_ATTR_NO_WARN);
-	if (rc)
-		goto out_free_sg;
+	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		iod->flags |= IOD_META_P2P_BUS_ADDR;
+	else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+		entries = 1;
+
+	if (WARN_ON_ONCE(!nvme_ctrl_meta_sgl_supported(&dev->ctrl) &&
+			 entries > 1))
+		return BLK_STS_NOTSUPP;
+
+	if (entries == 1 && !nvme_pci_metadata_use_sgls(req)) {
+		iod->cmd.common.metadata = cpu_to_le64(iter.addr);
+		iod->meta_total_len = iter.len;
+		iod->meta_dma = iter.addr;
+		iod->meta_descriptor = NULL;
+		return BLK_STS_OK;
+	}
 
 	sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
 			&sgl_dma);
 	if (!sg_list)
-		goto out_unmap_sg;
+		return BLK_STS_RESOURCE;
 
-	entries = iod->meta_sgt.nents;
 	iod->meta_descriptor = sg_list;
 	iod->meta_dma = sgl_dma;
 
 	iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
 	iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
 
-	sgl = iod->meta_sgt.sgl;
 	if (entries == 1) {
-		nvme_pci_sgl_set_data_sg(sg_list, sgl);
+		iod->flags |= IOD_META_SINGLE_SEG;
+		iod->meta_total_len = iter.len;
+		nvme_pci_sgl_set_data(sg_list, &iter);
 		return BLK_STS_OK;
 	}
 
 	sgl_dma += sizeof(*sg_list);
-	nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
-	for_each_sg(sgl, sg, entries, i)
-		nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
-
-	return BLK_STS_OK;
-
-out_unmap_sg:
-	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
-	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-	return BLK_STS_RESOURCE;
-}
-
-static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct bio_vec bv = rq_integrity_vec(req);
-
-	iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
-	if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
-		return BLK_STS_IOERR;
-	iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
-	return BLK_STS_OK;
-}
-
-static blk_status_t nvme_map_metadata(struct request *req)
-{
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	i = 0;
+	do {
+		nvme_pci_sgl_set_data(&sg_list[++i], &iter);
+		iod->meta_total_len += iter.len;
+	} while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
 
-	if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
-	    nvme_pci_metadata_use_sgls(req))
-		return nvme_pci_setup_meta_sgls(req);
-	return nvme_pci_setup_meta_mptr(req);
+	nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
+	return iter.status;
 }
 
 static blk_status_t nvme_prep_rq(struct request *req)
@@ -1111,7 +1092,6 @@ static blk_status_t nvme_prep_rq(struct request *req)
 	iod->flags = 0;
 	iod->nr_descriptors = 0;
 	iod->total_len = 0;
-	iod->meta_sgt.nents = 0;
 
 	ret = nvme_setup_cmd(req->q->queuedata, req);
 	if (ret)
@@ -1222,23 +1202,44 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
 	*rqlist = requeue_list;
 }
 
+static void nvme_free_meta_sgls(struct nvme_iod *iod, struct device *dma_dev,
+				enum dma_data_direction dir)
+{
+	struct nvme_sgl_desc *sg_list = iod->meta_descriptor;
+	int i, nr_entries;
+
+	if (iod->flags & IOD_META_SINGLE_SEG) {
+		dma_unmap_page(dma_dev, le64_to_cpu(sg_list->addr),
+				le32_to_cpu(sg_list->length), dir);
+		return;
+	}
+
+	nr_entries = le32_to_cpu(sg_list->length);
+	for (i = 1; i <= nr_entries; i++)
+		dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+				le32_to_cpu(sg_list[i].length), dir);
+}
+
 static __always_inline void nvme_unmap_metadata(struct request *req)
 {
-	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
-	struct nvme_dev *dev = nvmeq->dev;
+	enum dma_data_direction dir = rq_dma_dir(req);
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct device *dma_dev = nvmeq->dev->dev;
 
-	if (!iod->meta_sgt.nents) {
-		dma_unmap_page(dev->dev, iod->meta_dma,
-			       rq_integrity_vec(req).bv_len,
-			       rq_dma_dir(req));
-		return;
+	if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+				iod->meta_total_len,
+				iod->flags & IOD_META_P2P_BUS_ADDR)) {
+		if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
+			nvme_free_meta_sgls(iod, dma_dev, dir);
+		else
+			dma_unmap_page(dma_dev, iod->meta_dma,
+				       iod->meta_total_len, dir);
 	}
 
-	dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
-			iod->meta_dma);
-	dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-	mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+	if (iod->meta_descriptor)
+		dma_pool_free(nvmeq->descriptor_pools.small,
+			      iod->meta_descriptor, iod->meta_dma);
 }
 
 static __always_inline void nvme_pci_unmap_rq(struct request *req)
@@ -3046,7 +3047,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 
 static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 {
-	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
 	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
 
 	dev->dmavec_mempool = mempool_create_node(1,
@@ -3055,17 +3055,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
 			dev_to_node(dev->dev));
 	if (!dev->dmavec_mempool)
 		return -ENOMEM;
-
-	dev->iod_meta_mempool = mempool_create_node(1,
-			mempool_kmalloc, mempool_kfree,
-			(void *)meta_size, GFP_KERNEL,
-			dev_to_node(dev->dev));
-	if (!dev->iod_meta_mempool)
-		goto free;
 	return 0;
-free:
-	mempool_destroy(dev->dmavec_mempool);
-	return -ENOMEM;
 }
 
 static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3515,7 +3505,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	nvme_free_queues(dev, 0);
 out_release_iod_mempool:
 	mempool_destroy(dev->dmavec_mempool);
-	mempool_destroy(dev->iod_meta_mempool);
 out_dev_unmap:
 	nvme_dev_unmap(dev);
 out_uninit_ctrl:
@@ -3579,7 +3568,6 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_dbbuf_dma_free(dev);
 	nvme_free_queues(dev, 0);
 	mempool_destroy(dev->dmavec_mempool);
-	mempool_destroy(dev->iod_meta_mempool);
 	nvme_release_descriptor_pools(dev);
 	nvme_dev_unmap(dev);
 	nvme_uninit_ctrl(&dev->ctrl);
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 7/7] nvme: convert metadata mapping to dma iter
  2025-07-29 14:34 ` [PATCHv3 7/7] nvme: convert metadata mapping to dma iter Keith Busch
@ 2025-07-29 18:56   ` Keith Busch
  0 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 18:56 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, leonro

On Tue, Jul 29, 2025 at 07:34:42AM -0700, Keith Busch wrote:
> +static blk_status_t nvme_map_metadata(struct request *req)
>  {

...

> +	nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);

This needs an error check and unmap if so, just like data prp/sgl setup:

	if (unlikely(iter->status))
		nvme_unmap_metadata(req);

> +	return iter.status;
>  }

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated
  2025-07-29 14:34 ` [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
@ 2025-07-29 20:55   ` Keith Busch
  2025-07-30 12:15   ` Kanchan Joshi
  1 sibling, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-29 20:55 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, leonro

On Tue, Jul 29, 2025 at 07:34:37AM -0700, Keith Busch wrote:
> @@ -244,8 +244,10 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
>  	int nsegs = 0;
>  
>  	/* the internal flush request may not have bio attached */
> -	if (bio)
> +	if (bio) {
>  		iter.iter = bio->bi_iter;
> +		iter.bvec = bio->bi_io_vec;

Oops, this should have been:

		if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
			iter.bvec = &rq->special_vec;
		else
			iter.bvec = bio->bi_io_vec;

Hopefully over time everything converts to the dma iterator and this
legacy sg mapping can fade away.

> +	}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 1/7] blk-mq: introduce blk_map_iter
  2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
@ 2025-07-30  6:52   ` Kanchan Joshi
  2025-07-30  8:18   ` Kanchan Joshi
  1 sibling, 0 replies; 16+ messages in thread
From: Kanchan Joshi @ 2025-07-30  6:52 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

On 7/29/2025 8:04 PM, Keith Busch wrote:
> -	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
> -			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
> -	if (dma_mapping_error(dma_dev, iter->addr)) {
> +	iter->addr = dma_map_page(dma_dev, phys_to_page(iter->iter.paddr),
> +			offset_in_page(iter->iter.paddr), iter->iter.len,
> +			rq_dma_dir(req));
> +	if (dma_mapping_error(dma_dev, iter->iter.paddr)) {

This should be
dma_mapping_error(dma_dev, iter->addr).

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 1/7] blk-mq: introduce blk_map_iter
  2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
  2025-07-30  6:52   ` Kanchan Joshi
@ 2025-07-30  8:18   ` Kanchan Joshi
  2025-07-30 15:18     ` Keith Busch
  1 sibling, 1 reply; 16+ messages in thread
From: Kanchan Joshi @ 2025-07-30  8:18 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

On 7/29/2025 8:04 PM, Keith Busch wrote:
> @@ -39,12 +33,11 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
>   	 * one could be merged into it.  This typically happens when moving to
>   	 * the next bio, but some callers also don't pack bvecs tight.
>   	 */
> -	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
> +	while (!iter->iter.bi_size ||
> +	       (!iter->iter.bi_bvec_done && iter->bio->bi_next)) {
>   		struct bio_vec next;
>   
>   		if (!iter->iter.bi_size) {
> -			if (!iter->bio->bi_next)
> -				break;
>   			iter->bio = iter->bio->bi_next;
>   			iter->iter = iter->bio->bi_iter;

This can crash here if we enter the loop because
iter->iter.bi_size is 0
and this is the last bio, i.e., iter->bio->bi_next is NULL?




^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated
  2025-07-29 14:34 ` [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
  2025-07-29 20:55   ` Keith Busch
@ 2025-07-30 12:15   ` Kanchan Joshi
  2025-07-30 14:18     ` Keith Busch
  1 sibling, 1 reply; 16+ messages in thread
From: Kanchan Joshi @ 2025-07-30 12:15 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-nvme, hch; +Cc: axboe, leonro, Keith Busch

On 7/29/2025 8:04 PM, Keith Busch wrote:
> @@ -151,6 +146,11 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
>   	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
>   	iter->status = BLK_STS_OK;
>   
> +	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> +		iter->iter.bvec = &req->special_vec;

I am not certain yet, but thinking whether this is enough to handle 
RQF_SPECIAL_PAYLOAD correctly.
Maybe "req->special_vec.bv_len" also needs to be included here to 
initialize the iter.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated
  2025-07-30 12:15   ` Kanchan Joshi
@ 2025-07-30 14:18     ` Keith Busch
  0 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2025-07-30 14:18 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: Keith Busch, linux-block, linux-nvme, hch, axboe, leonro

On Wed, Jul 30, 2025 at 05:45:27PM +0530, Kanchan Joshi wrote:
> On 7/29/2025 8:04 PM, Keith Busch wrote:
> > @@ -151,6 +146,11 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
> >   	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
> >   	iter->status = BLK_STS_OK;
> >   
> > +	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
> > +		iter->iter.bvec = &req->special_vec;
> 
> I am not certain yet, but thinking whether this is enough to handle 
> RQF_SPECIAL_PAYLOAD correctly.
> Maybe "req->special_vec.bv_len" also need to be included here to 
> initialize the iter.

Yeah, I think you're right. I had only tested 'discard' for the special
payload, and it appeared to be okay, but I suspect for the wrong
reasons. I'll fix it up.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 1/7] blk-mq: introduce blk_map_iter
  2025-07-30  8:18   ` Kanchan Joshi
@ 2025-07-30 15:18     ` Keith Busch
  2025-07-31  5:05       ` Kanchan Joshi
  0 siblings, 1 reply; 16+ messages in thread
From: Keith Busch @ 2025-07-30 15:18 UTC (permalink / raw)
  To: Kanchan Joshi; +Cc: Keith Busch, linux-block, linux-nvme, hch, axboe, leonro

On Wed, Jul 30, 2025 at 01:48:42PM +0530, Kanchan Joshi wrote:
> On 7/29/2025 8:04 PM, Keith Busch wrote:
> > @@ -39,12 +33,11 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
> >   	 * one could be merged into it.  This typically happens when moving to
> >   	 * the next bio, but some callers also don't pack bvecs tight.
> >   	 */
> > -	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
> > +	while (!iter->iter.bi_size ||
> > +	       (!iter->iter.bi_bvec_done && iter->bio->bi_next)) {
> >   		struct bio_vec next;
> >   
> >   		if (!iter->iter.bi_size) {
> > -			if (!iter->bio->bi_next)
> > -				break;
> >   			iter->bio = iter->bio->bi_next;
> >   			iter->iter = iter->bio->bi_iter;
> 
> This can crash here if we come inside the loop because 
> iter->iter.bi_size is 0
> and if this is the last bio i.e., iter->bio->bi_next is NULL?

Nah, I changed the while loop condition to ensure bio->bi_next isn't
NULL if the current bi_size is 0. But I don't recall why I moved the
condition check to there in the first place either.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCHv3 1/7] blk-mq: introduce blk_map_iter
  2025-07-30 15:18     ` Keith Busch
@ 2025-07-31  5:05       ` Kanchan Joshi
  0 siblings, 0 replies; 16+ messages in thread
From: Kanchan Joshi @ 2025-07-31  5:05 UTC (permalink / raw)
  To: Keith Busch; +Cc: Keith Busch, linux-block, linux-nvme, hch, axboe, leonro

On 7/30/2025 8:48 PM, Keith Busch wrote:
> On Wed, Jul 30, 2025 at 01:48:42PM +0530, Kanchan Joshi wrote:
>> On 7/29/2025 8:04 PM, Keith Busch wrote:
>>> @@ -39,12 +33,11 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
>>>    	 * one could be merged into it.  This typically happens when moving to
>>>    	 * the next bio, but some callers also don't pack bvecs tight.
>>>    	 */
>>> -	while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
>>> +	while (!iter->iter.bi_size ||
>>> +	       (!iter->iter.bi_bvec_done && iter->bio->bi_next)) {
>>>    		struct bio_vec next;
>>>    
>>>    		if (!iter->iter.bi_size) {
>>> -			if (!iter->bio->bi_next)
>>> -				break;
>>>    			iter->bio = iter->bio->bi_next;
>>>    			iter->iter = iter->bio->bi_iter;
>> This can crash here if we come inside the loop because
>> iter->iter.bi_size is 0
>> and if this is the last bio i.e., iter->bio->bi_next is NULL?
> Nah, I changed the while loop condition to ensure bio->bi_next isn't
> NULL if the current bi_size is 0. But I don't recall why I moved the
> condition check to there in the first place either.

Yes, you moved it, but that is not going to guard when 
iter->iter.bi_size is 0.

while (true || immaterial) {
	..
	if (true) {
		iter->bio = NULL;
		iter->iter = iter->bio->bi_iter;  //crash here
	}
}

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-07-31  5:05 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-07-29 14:34 [PATCHv3 0/7] blk dma iter for integrity metadata Keith Busch
2025-07-29 14:34 ` [PATCHv3 1/7] blk-mq: introduce blk_map_iter Keith Busch
2025-07-30  6:52   ` Kanchan Joshi
2025-07-30  8:18   ` Kanchan Joshi
2025-07-30 15:18     ` Keith Busch
2025-07-31  5:05       ` Kanchan Joshi
2025-07-29 14:34 ` [PATCHv3 2/7] blk-mq-dma: provide the bio_vec list being iterated Keith Busch
2025-07-29 20:55   ` Keith Busch
2025-07-30 12:15   ` Kanchan Joshi
2025-07-30 14:18     ` Keith Busch
2025-07-29 14:34 ` [PATCHv3 3/7] blk-mq-dma: require unmap caller provide p2p map type Keith Busch
2025-07-29 14:34 ` [PATCHv3 4/7] blk-mq: remove REQ_P2PDMA flag Keith Busch
2025-07-29 14:34 ` [PATCHv3 5/7] blk-mq-dma: move common dma start code to a helper Keith Busch
2025-07-29 14:34 ` [PATCHv3 6/7] blk-mq-dma: add support for mapping integrity metadata Keith Busch
2025-07-29 14:34 ` [PATCHv3 7/7] nvme: convert metadata mapping to dma iter Keith Busch
2025-07-29 18:56   ` Keith Busch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox