* [PATCHv7 1/9] blk-mq-dma: create blk_map_iter type
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
The req_iterator happens to have fields similar to what the dma
iterator needs, but we're not necessarily iterating a request's
bi_io_vec. Create a new type that can be extended for additional future
uses.
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq-dma.c | 4 ++--
include/linux/blk-mq-dma.h | 7 ++++++-
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index ad283017caef2..51e7a0ff045f9 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -10,7 +10,7 @@ struct phys_vec {
u32 len;
};
-static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
+static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
struct phys_vec *vec)
{
unsigned int max_size;
@@ -246,7 +246,7 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
struct scatterlist **last_sg)
{
- struct req_iterator iter = {
+ struct blk_map_iter iter = {
.bio = rq->bio,
};
struct phys_vec vec;
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index c26a01aeae006..6a7e3828673d7 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -5,6 +5,11 @@
#include <linux/blk-mq.h>
#include <linux/pci-p2pdma.h>
+struct blk_map_iter {
+ struct bvec_iter iter;
+ struct bio *bio;
+};
+
struct blk_dma_iter {
/* Output address range for this iteration */
dma_addr_t addr;
@@ -14,7 +19,7 @@ struct blk_dma_iter {
blk_status_t status;
/* Internal to blk_rq_dma_map_iter_* */
- struct req_iterator iter;
+ struct blk_map_iter iter;
struct pci_p2pdma_map_state p2pdma;
};
--
2.47.3
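For readers comparing against mainline: the req_iterator being replaced is
defined in include/linux/blk-mq.h with exactly the two fields the dma iterator
was borrowing, so the new type starts out as a field-for-field copy that later
patches in the series can grow. A side-by-side sketch:

	/* include/linux/blk-mq.h (mainline definition) */
	struct req_iterator {
		struct bvec_iter iter;	/* position within the current bio */
		struct bio *bio;	/* current bio in the request chain */
	};

	/* include/linux/blk-mq-dma.h, after this patch */
	struct blk_map_iter {
		struct bvec_iter iter;
		struct bio *bio;
	};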
* [PATCHv7 2/9] blk-mq-dma: provide the bio_vec array being iterated
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
This will make it easier to add different sources for the bvec array,
like the upcoming integrity support, rather than assuming the bio's
bi_io_vec. It also makes iterating "special" payloads more consistent
with iterating normal payloads.
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
block/blk-mq-dma.c | 56 ++++++++++++++++++++++----------------
include/linux/blk-mq-dma.h | 1 +
2 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 51e7a0ff045f9..8f41fe740b465 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -16,23 +16,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
unsigned int max_size;
struct bio_vec bv;
- if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- if (!iter->bio)
- return false;
- vec->paddr = bvec_phys(&req->special_vec);
- vec->len = req->special_vec.bv_len;
- iter->bio = NULL;
- return true;
- }
-
if (!iter->iter.bi_size)
return false;
- bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
vec->paddr = bvec_phys(&bv);
max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX);
bv.bv_len = min(bv.bv_len, max_size);
- bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len);
+ bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len);
/*
* If we are entirely done with this bi_io_vec entry, check if the next
@@ -43,19 +34,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
struct bio_vec next;
if (!iter->iter.bi_size) {
- if (!iter->bio->bi_next)
+ if (!iter->bio || !iter->bio->bi_next)
break;
iter->bio = iter->bio->bi_next;
iter->iter = iter->bio->bi_iter;
+ iter->bvecs = iter->bio->bi_io_vec;
}
- next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter);
+ next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
if (bv.bv_len + next.bv_len > max_size ||
!biovec_phys_mergeable(req->q, &bv, &next))
break;
bv.bv_len += next.bv_len;
- bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len);
+ bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len);
}
vec->len = bv.bv_len;
@@ -125,6 +117,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
return true;
}
+static inline void blk_rq_map_iter_init(struct request *rq,
+ struct blk_map_iter *iter)
+{
+ struct bio *bio = rq->bio;
+
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ *iter = (struct blk_map_iter) {
+ .bvecs = &rq->special_vec,
+ .iter = {
+ .bi_size = rq->special_vec.bv_len,
+ }
+ };
+ } else if (bio) {
+ *iter = (struct blk_map_iter) {
+ .bio = bio,
+ .bvecs = bio->bi_io_vec,
+ .iter = bio->bi_iter,
+ };
+ } else {
+ /* the internal flush request may not have bio attached */
+ *iter = (struct blk_map_iter) {};
+ }
+}
+
/**
* blk_rq_dma_map_iter_start - map the first DMA segment for a request
* @req: request to map
@@ -153,8 +169,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
unsigned int total_len = blk_rq_payload_bytes(req);
struct phys_vec vec;
- iter->iter.bio = req->bio;
- iter->iter.iter = req->bio->bi_iter;
+ blk_rq_map_iter_init(req, &iter->iter);
memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
iter->status = BLK_STS_OK;
@@ -246,16 +261,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
struct scatterlist **last_sg)
{
- struct blk_map_iter iter = {
- .bio = rq->bio,
- };
+ struct blk_map_iter iter;
struct phys_vec vec;
int nsegs = 0;
- /* the internal flush request may not have bio attached */
- if (iter.bio)
- iter.iter = iter.bio->bi_iter;
-
+ blk_rq_map_iter_init(rq, &iter);
while (blk_map_iter_next(rq, &iter, &vec)) {
*last_sg = blk_next_sg(last_sg, sglist);
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 6a7e3828673d7..e5cb5e46fc928 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -8,6 +8,7 @@
struct blk_map_iter {
struct bvec_iter iter;
struct bio *bio;
+ struct bio_vec *bvecs;
};
struct blk_dma_iter {
--
2.47.3
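With the bvec array carried in the iterator itself, blk_map_iter_next() no
longer cares where the vectors come from. A condensed sketch of the
initializations this decoupling allows; the first two are lifted from
blk_rq_map_iter_init() above, while the third does not exist yet and is shown
only to motivate the change (it arrives with the integrity patches later in
the series):

	/* normal payload: walk the bio's own vector array */
	iter = (struct blk_map_iter) {
		.bio	= bio,
		.bvecs	= bio->bi_io_vec,
		.iter	= bio->bi_iter,
	};

	/* special payload (e.g. discard): walk the lone special_vec */
	iter = (struct blk_map_iter) {
		.bvecs	= &rq->special_vec,
		.iter	= { .bi_size = rq->special_vec.bv_len },
	};

	/* integrity metadata (patch 6): walk the bip's vector array */
	iter = (struct blk_map_iter) {
		.bio	= bio,
		.bvecs	= bio_integrity(bio)->bip_vec,
		.iter	= bio_integrity(bio)->bip_iter,
	};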
* [PATCHv7 3/9] blk-mq-dma: require unmap caller provide p2p map type
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
In preparation for integrity dma mappings, we can't rely on the request
flag because data and metadata may have different mapping types.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/pci.c | 9 ++++++++-
include/linux/blk-mq-dma.h | 5 +++--
2 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2c6d9506b1725..111b6bc6c93eb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -261,6 +261,9 @@ enum nvme_iod_flags {
/* single segment dma mapping */
IOD_SINGLE_SEGMENT = 1U << 2,
+
+ /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+ IOD_P2P_BUS_ADDR = 1U << 3,
};
struct nvme_dma_vec {
@@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
+ iod->flags & IOD_P2P_BUS_ADDR)) {
if (nvme_pci_cmd_use_sgl(&iod->cmd))
nvme_free_sgls(req);
else
@@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req)
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
+ if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ iod->flags |= IOD_P2P_BUS_ADDR;
+
if (use_sgl == SGL_FORCED ||
(use_sgl == SGL_SUPPORTED &&
(sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index e5cb5e46fc928..881880095e0da 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -47,14 +47,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
* @dma_dev: device to unmap from
* @state: DMA IOVA state
* @mapped_len: number of bytes to unmap
+ * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR
*
* Returns %false if the callers need to manually unmap every DMA segment
* mapped using @iter or %true if no work is left to be done.
*/
static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
- struct dma_iova_state *state, size_t mapped_len)
+ struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
{
- if (req->cmd_flags & REQ_P2PDMA)
+ if (is_p2p)
return true;
if (dma_use_iova(state)) {
--
2.47.3
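The contract this creates for drivers: record the p2p mapping type observed at
map time and hand it back at unmap time, since the request flags no longer
encode it. A minimal sketch of the pairing, modeled on the nvme hunks above
(iod and IOD_P2P_BUS_ADDR are the driver-private state introduced there):

	/* map side: remember whether the bus-address path was taken */
	if (!blk_rq_dma_map_iter_start(req, dma_dev, &iod->dma_state, &iter))
		return iter.status;
	if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
		iod->flags |= IOD_P2P_BUS_ADDR;

	/* unmap side: feed the recorded type back in */
	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
			      iod->flags & IOD_P2P_BUS_ADDR)) {
		/* not fully unmapped: undo each segment individually */
	}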
* Re: [PATCHv7 3/9] blk-mq-dma: require unmap caller provide p2p map type
From: Christoph Hellwig @ 2025-08-13 15:53 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
On Wed, Aug 13, 2025 at 08:31:47AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> In preparing for integrity dma mappings, we can't rely on the request
> flag because data and metadata may have different mapping types.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
Didn't I review this already? Either way:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* Re: [PATCHv7 3/9] blk-mq-dma: require unmap caller provide p2p map type
From: Kanchan Joshi @ 2025-08-14 10:18 UTC
To: Keith Busch, linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
* [PATCHv7 4/9] blk-mq: remove REQ_P2PDMA flag
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
It's not serving any particular purpose. pci_p2pdma_state() already has
all the appropriate checks, so the config and flag checks are not
guarding anything.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 2 +-
block/blk-mq-dma.c | 30 ++++++++++++++----------------
include/linux/blk_types.h | 2 --
3 files changed, 15 insertions(+), 19 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 3b371a5da159e..44c43b9703875 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_full(bio, len));
if (is_pci_p2pdma_page(page))
- bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+ bio->bi_opf |= REQ_NOMERGE;
bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 8f41fe740b465..58defab218823 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -180,22 +180,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
if (!blk_map_iter_next(req, &iter->iter, &vec))
return false;
- if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
- switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
- phys_to_page(vec.paddr))) {
- case PCI_P2PDMA_MAP_BUS_ADDR:
- return blk_dma_map_bus(iter, &vec);
- case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
- /*
- * P2P transfers through the host bridge are treated the
- * same as non-P2P transfers below and during unmap.
- */
- req->cmd_flags &= ~REQ_P2PDMA;
- break;
- default:
- iter->status = BLK_STS_INVAL;
- return false;
- }
+ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+ phys_to_page(vec.paddr))) {
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ return blk_dma_map_bus(iter, &vec);
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * P2P transfers through the host bridge are treated the
+ * same as non-P2P transfers below and during unmap.
+ */
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ default:
+ iter->status = BLK_STS_INVAL;
+ return false;
}
if (blk_can_dma_map_iova(req, dma_dev) &&
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 09b99d52fd365..930daff207df2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -386,7 +386,6 @@ enum req_flag_bits {
__REQ_DRV, /* for driver use */
__REQ_FS_PRIVATE, /* for file system (submitter) use */
__REQ_ATOMIC, /* for atomic write operations */
- __REQ_P2PDMA, /* contains P2P DMA pages */
/*
* Command specific flags, keep last:
*/
@@ -419,7 +418,6 @@ enum req_flag_bits {
#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV)
#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA)
#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
--
2.47.3
* Re: [PATCHv7 4/9] blk-mq: remove REQ_P2PDMA flag
From: Christoph Hellwig @ 2025-08-13 15:53 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
On Wed, Aug 13, 2025 at 08:31:48AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> It's not serving any particular purpose. pci_p2pdma_state() already has
> all the appropriate checks, so the config and flag checks are not
> guarding anything.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
I think I've also reviewed this before, but:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* Re: [PATCHv7 4/9] blk-mq: remove REQ_P2PDMA flag
From: Kanchan Joshi @ 2025-08-14 10:19 UTC
To: Keith Busch, linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
* [PATCHv7 5/9] blk-mq-dma: move common dma start code to a helper
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
In preparation for dma mapping integrity metadata, move the common dma
setup to a helper.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/blk-mq-dma.c | 59 ++++++++++++++++++++++++++--------------------
1 file changed, 33 insertions(+), 26 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 58defab218823..31dd8f58f0811 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -141,35 +141,12 @@ static inline void blk_rq_map_iter_init(struct request *rq,
}
}
-/**
- * blk_rq_dma_map_iter_start - map the first DMA segment for a request
- * @req: request to map
- * @dma_dev: device to map to
- * @state: DMA IOVA state
- * @iter: block layer DMA iterator
- *
- * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
- * caller and don't need to be initialized. @state needs to be stored for use
- * at unmap time, @iter is only needed at map time.
- *
- * Returns %false if there is no segment to map, including due to an error, or
- * %true ft it did map a segment.
- *
- * If a segment was mapped, the DMA address for it is returned in @iter.addr and
- * the length in @iter.len. If no segment was mapped the status code is
- * returned in @iter.status.
- *
- * The caller can call blk_rq_dma_map_coalesce() to check if further segments
- * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
- * to try to map the following segments.
- */
-bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
- struct dma_iova_state *state, struct blk_dma_iter *iter)
+static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter,
+ unsigned int total_len)
{
- unsigned int total_len = blk_rq_payload_bytes(req);
struct phys_vec vec;
- blk_rq_map_iter_init(req, &iter->iter);
memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
iter->status = BLK_STS_OK;
@@ -201,6 +178,36 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the
+ * caller and don't need to be initialized. @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+ struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+ blk_rq_map_iter_init(req, &iter->iter);
+ return blk_dma_map_iter_start(req, dma_dev, state, iter,
+ blk_rq_payload_bytes(req));
+}
EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
/**
--
2.47.3
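The refactor leaves the caller-visible loop untouched; only the setup behind
blk_rq_dma_map_iter_start() is now shared with the integrity variant added
next. For orientation, a sketch of the canonical consumer, assuming the
mainline blk_rq_dma_map_iter_next() signature:

	struct dma_iova_state state;
	struct blk_dma_iter iter;

	if (!blk_rq_dma_map_iter_start(req, dma_dev, &state, &iter))
		return iter.status;
	do {
		/* program iter.addr / iter.len into a device descriptor */
	} while (blk_rq_dma_map_iter_next(req, dma_dev, &state, &iter));
	if (iter.status != BLK_STS_OK)
		return iter.status;	/* iteration stopped on an error */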
* Re: [PATCHv7 5/9] blk-mq-dma: move common dma start code to a helper
From: Christoph Hellwig @ 2025-08-13 15:53 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* Re: [PATCHv7 5/9] blk-mq-dma: move common dma start code to a helper
From: Kanchan Joshi @ 2025-08-14 10:19 UTC
To: Keith Busch, linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
* [PATCHv7 6/9] blk-mq-dma: add scatter-less integrity data DMA mapping
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Similar to regular data, introduce more efficient integrity mapping
helpers that do away with the scatterlist structure. This uses the
block mapping iterator to add IOVA segments if an IOMMU is enabled, or maps
directly if not. This also supports P2P segments if integrity data ever
wants to allocate that type of memory.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/blk-mq-dma.c | 104 +++++++++++++++++++++++++++++++---
include/linux/blk-integrity.h | 17 ++++++
include/linux/blk-mq-dma.h | 1 +
3 files changed, 115 insertions(+), 7 deletions(-)
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 31dd8f58f0811..60a244a129c3c 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2025 Christoph Hellwig
*/
+#include <linux/blk-integrity.h>
#include <linux/blk-mq-dma.h>
#include "blk.h"
@@ -10,6 +11,24 @@ struct phys_vec {
u32 len;
};
+static bool __blk_map_iter_next(struct blk_map_iter *iter)
+{
+ if (iter->iter.bi_size)
+ return true;
+ if (!iter->bio || !iter->bio->bi_next)
+ return false;
+
+ iter->bio = iter->bio->bi_next;
+ if (iter->is_integrity) {
+ iter->iter = bio_integrity(iter->bio)->bip_iter;
+ iter->bvecs = bio_integrity(iter->bio)->bip_vec;
+ } else {
+ iter->iter = iter->bio->bi_iter;
+ iter->bvecs = iter->bio->bi_io_vec;
+ }
+ return true;
+}
+
static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
struct phys_vec *vec)
{
@@ -33,13 +52,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) {
struct bio_vec next;
- if (!iter->iter.bi_size) {
- if (!iter->bio || !iter->bio->bi_next)
- break;
- iter->bio = iter->bio->bi_next;
- iter->iter = iter->bio->bi_iter;
- iter->bvecs = iter->bio->bi_io_vec;
- }
+ if (!__blk_map_iter_next(iter))
+ break;
next = mp_bvec_iter_bvec(iter->bvecs, iter->iter);
if (bv.bv_len + next.bv_len > max_size ||
@@ -290,3 +304,79 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
return nsegs;
}
EXPORT_SYMBOL(__blk_rq_map_sg);
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+/**
+ * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment
+ * for a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are
+ * provided by the caller and don't need to be initialized. @state needs to be
+ * stored for use at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr
+ * and the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ struct blk_dma_iter *iter)
+{
+ unsigned len = bio_integrity_bytes(&req->q->limits.integrity,
+ blk_rq_sectors(req));
+ struct bio *bio = req->bio;
+
+ iter->iter = (struct blk_map_iter) {
+ .bio = bio,
+ .iter = bio_integrity(bio)->bip_iter,
+ .bvecs = bio_integrity(bio)->bip_vec,
+ .is_integrity = true,
+ };
+ return blk_dma_map_iter_start(req, dma_dev, state, iter, len);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start);
+
+/**
+ * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for
+ * a request
+ * @req: request to map
+ * @dma_dev: device to map to
+ * @state: DMA IOVA state
+ * @iter: block layer DMA iterator
+ *
+ * Iterate to the next integrity mapping after a previous call to
+ * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description
+ * of the arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len. If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+ struct device *dma_dev, struct blk_dma_iter *iter)
+{
+ struct phys_vec vec;
+
+ if (!blk_map_iter_next(req, &iter->iter, &vec))
+ return false;
+
+ if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ return blk_dma_map_bus(iter, &vec);
+ return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
+#endif
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index e67a2b6e8f111..78fe2459e6612 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -4,6 +4,7 @@
#include <linux/blk-mq.h>
#include <linux/bio-integrity.h>
+#include <linux/blk-mq-dma.h>
struct request;
@@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes);
int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
struct logical_block_metadata_cap __user *argp);
+bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ struct blk_dma_iter *iter);
+bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+ struct device *dma_dev, struct blk_dma_iter *iter);
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq,
{
return -EINVAL;
}
+static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req,
+ struct device *dma_dev, struct dma_iova_state *state,
+ struct blk_dma_iter *iter)
+{
+ return false;
+}
+static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req,
+ struct device *dma_dev, struct blk_dma_iter *iter)
+{
+ return false;
+}
static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
{
return NULL;
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
index 881880095e0da..0f45ea110ca12 100644
--- a/include/linux/blk-mq-dma.h
+++ b/include/linux/blk-mq-dma.h
@@ -9,6 +9,7 @@ struct blk_map_iter {
struct bvec_iter iter;
struct bio *bio;
struct bio_vec *bvecs;
+ bool is_integrity;
};
struct blk_dma_iter {
--
2.47.3
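The consumer pattern mirrors the data path; note that the next() variant here
takes no dma_iova_state, since a coalesced IOVA mapping is produced entirely
by the start() call. A minimal sketch, not a drop-in implementation:

	struct dma_iova_state state;
	struct blk_dma_iter iter;

	if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev, &state, &iter))
		return iter.status;
	do {
		/* iter.addr / iter.len describe one metadata DMA segment */
	} while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));
	if (iter.status != BLK_STS_OK)
		return iter.status;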
* Re: [PATCHv7 6/9] blk-mq-dma: add scatter-less integrity data DMA mapping
From: Christoph Hellwig @ 2025-08-13 15:55 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
I got two copies of this which both look the same.
.. and good, so:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* Re: [PATCHv7 6/9] blk-mq-dma: add scatter-less integrity data DMA mapping
From: Kanchan Joshi @ 2025-08-14 10:36 UTC
To: Keith Busch, linux-block, linux-nvme; +Cc: hch, axboe, Keith Busch
On 8/13/2025 9:01 PM, Keith Busch wrote:
> + * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for
> + * a request
> + * @req: request to map
> + * @dma_dev: device to map to
> + * @state: DMA IOVA state
> + * @iter: block layer DMA iterator
> + *
> + * Iterate to the next integrity mapping after a previous call to
> + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description
> + * of the arguments.
> + *
> + * Returns %false if there is no segment to map, including due to an error, or
> + * %true if it did map a segment.
> + *
> + * If a segment was mapped, the DMA address for it is returned in @iter.addr and
> + * the length in @iter.len. If no segment was mapped the status code is
> + * returned in @iter.status.
> + */
> +bool blk_rq_integrity_dma_map_iter_next(struct request *req,
> + struct device *dma_dev, struct blk_dma_iter *iter)
> +{
The function comment should also use the name
"blk_rq_integrity_dma_map_iter_next".
Otherwise, looks good.
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
* [PATCHv7 7/9] blk-integrity: use iterator for mapping sg
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Modify blk_rq_map_integrity_sg to use the blk-mq mapping iterator. This
produces more efficient code and converges the integrity mapping
implementations to reduce future maintenance burdens.
The function implementation moves from blk-integrity.c to blk-mq-dma.c
in order to use the types and functions private to that file.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/blk-integrity.c | 58 -------------------------------------------
block/blk-mq-dma.c | 45 +++++++++++++++++++++++++++++++++
2 files changed, 45 insertions(+), 58 deletions(-)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 056b8948369d5..dd97b27366e0e 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -122,64 +122,6 @@ int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
NULL);
}
-/**
- * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
- * @rq: request to map
- * @sglist: target scatterlist
- *
- * Description: Map the integrity vectors in request into a
- * scatterlist. The scatterlist must be big enough to hold all
- * elements. I.e. sized using blk_rq_count_integrity_sg() or
- * rq->nr_integrity_segments.
- */
-int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
-{
- struct bio_vec iv, ivprv = { NULL };
- struct request_queue *q = rq->q;
- struct scatterlist *sg = NULL;
- struct bio *bio = rq->bio;
- unsigned int segments = 0;
- struct bvec_iter iter;
- int prev = 0;
-
- bio_for_each_integrity_vec(iv, bio, iter) {
- if (prev) {
- if (!biovec_phys_mergeable(q, &ivprv, &iv))
- goto new_segment;
- if (sg->length + iv.bv_len > queue_max_segment_size(q))
- goto new_segment;
-
- sg->length += iv.bv_len;
- } else {
-new_segment:
- if (!sg)
- sg = sglist;
- else {
- sg_unmark_end(sg);
- sg = sg_next(sg);
- }
-
- sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);
- segments++;
- }
-
- prev = 1;
- ivprv = iv;
- }
-
- if (sg)
- sg_mark_end(sg);
-
- /*
- * Something must have been wrong if the figured number of segment
- * is bigger than number of req's physical integrity segments
- */
- BUG_ON(segments > rq->nr_integrity_segments);
- BUG_ON(segments > queue_max_integrity_segments(q));
- return segments;
-}
-EXPORT_SYMBOL(blk_rq_map_integrity_sg);
-
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes)
{
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 60a244a129c3c..660b5e200ccf6 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -379,4 +379,49 @@ bool blk_rq_integrity_dma_map_iter_next(struct request *req,
return blk_dma_map_direct(req, dma_dev, iter, &vec);
}
EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next);
+
+/**
+ * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
+ * @rq: request to map
+ * @sglist: target scatterlist
+ *
+ * Description: Map the integrity vectors in request into a
+ * scatterlist. The scatterlist must be big enough to hold all
+ * elements. I.e. sized using blk_rq_count_integrity_sg() or
+ * rq->nr_integrity_segments.
+ */
+int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
+{
+ struct request_queue *q = rq->q;
+ struct scatterlist *sg = NULL;
+ struct bio *bio = rq->bio;
+ unsigned int segments = 0;
+ struct phys_vec vec;
+
+ struct blk_map_iter iter = {
+ .bio = bio,
+ .iter = bio_integrity(bio)->bip_iter,
+ .bvecs = bio_integrity(bio)->bip_vec,
+ .is_integrity = true,
+ };
+
+ while (blk_map_iter_next(rq, &iter, &vec)) {
+ sg = blk_next_sg(&sg, sglist);
+ sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
+ offset_in_page(vec.paddr));
+ segments++;
+ }
+
+ if (sg)
+ sg_mark_end(sg);
+
+ /*
+ * Something must have been wrong if the figured number of segment
+ * is bigger than number of req's physical integrity segments
+ */
+ BUG_ON(segments > rq->nr_integrity_segments);
+ BUG_ON(segments > queue_max_integrity_segments(q));
+ return segments;
+}
+EXPORT_SYMBOL(blk_rq_map_integrity_sg);
#endif
--
2.47.3
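The calling convention is unchanged by the move, so legacy scatterlist
consumers need no updates. A sketch of the caller side, patterned on the nvme
code this series deletes in patch 9 (sgl is a caller-supplied scatterlist,
sized per the kerneldoc above):

	sg_init_table(sgl, req->nr_integrity_segments);
	nents = blk_rq_map_integrity_sg(req, sgl);
	/*
	 * sgl now holds nents end-marked physical segments, ready for the
	 * driver to hand to dma_map_sgtable()/dma_map_sg().
	 */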
* Re: [PATCHv7 7/9] blk-integrity: use iterator for mapping sg
From: Christoph Hellwig @ 2025-08-13 15:55 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
On Wed, Aug 13, 2025 at 08:31:51AM -0700, Keith Busch wrote:
[...]
> +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
> +{
> + struct request_queue *q = rq->q;
> + struct scatterlist *sg = NULL;
> + struct bio *bio = rq->bio;
> + unsigned int segments = 0;
> + struct phys_vec vec;
> +
> + struct blk_map_iter iter = {
The empty line above is a bit odd.
Otherwise this looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* Re: [PATCHv7 7/9] blk-integrity: use iterator for mapping sg
From: Martin K. Petersen @ 2025-08-13 16:53 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
Keith,
> Modify blk_rq_map_integrity_sg to use the blk-mq mapping iterator.
> This produces more efficient code and converges the integrity mapping
> implementations to reduce future maintenance burdens.
>
> The function implementation moves from blk-integrity.c to blk-mq-dma.c
> in order to use the types and functions private to that file.
Very nice!
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
--
Martin K. Petersen
* [PATCHv7 8/9] nvme-pci: create common sgl unmapping helper
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
This can be reused for metadata sgls once metadata mapping starts using
the blk-mq dma api.
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/pci.c | 29 ++++++++++++++---------------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 111b6bc6c93eb..e12d47fecc584 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -693,25 +693,23 @@ static void nvme_free_prps(struct request *req)
mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
}
-static void nvme_free_sgls(struct request *req)
+static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
+ struct nvme_sgl_desc *sg_list)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct device *dma_dev = nvmeq->dev->dev;
- dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
- unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
- struct nvme_sgl_desc *sg_list = iod->descriptors[0];
enum dma_data_direction dir = rq_dma_dir(req);
+ unsigned int len = le32_to_cpu(sge->length);
+ struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int i;
- if (iod->nr_descriptors) {
- unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
-
- for (i = 0; i < nr_entries; i++)
- dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
- le32_to_cpu(sg_list[i].length), dir);
- } else {
- dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+ if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) {
+ dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir);
+ return;
}
+
+ for (i = 0; i < len / sizeof(*sg_list); i++)
+ dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir);
}
static void nvme_unmap_data(struct request *req)
@@ -731,7 +729,8 @@ static void nvme_unmap_data(struct request *req)
if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len,
iod->flags & IOD_P2P_BUS_ADDR)) {
if (nvme_pci_cmd_use_sgl(&iod->cmd))
- nvme_free_sgls(req);
+ nvme_free_sgls(req, &iod->cmd.common.dptr.sgl,
+ iod->descriptors[0]);
else
nvme_free_prps(req);
}
--
2.47.3
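The reworked helper takes the leading descriptor plus the list it may point
at, which is exactly what lets the next patch reuse it for metadata, where
both live in a single pool allocation. The two call shapes side by side (the
metadata one is from patch 9):

	/* data: inline SQE descriptor, separately allocated list */
	nvme_free_sgls(req, &iod->cmd.common.dptr.sgl, iod->descriptors[0]);

	/* metadata: descriptor and its list share one allocation */
	nvme_free_sgls(req, sge, &sge[1]);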
* [PATCHv7 9/9] nvme-pci: convert metadata mapping to dma iter
From: Keith Busch @ 2025-08-13 15:31 UTC
To: linux-block, linux-nvme; +Cc: hch, axboe, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Aligns metadata with the dma mapping scheme used for data and removes
one more user of the scatter-gather dma mapping.
Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/pci.c | 163 +++++++++++++++++++++-------------------
1 file changed, 87 insertions(+), 76 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e12d47fecc584..d8a9dee55de33 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -172,9 +172,7 @@ struct nvme_dev {
u32 last_ps;
bool hmb;
struct sg_table *hmb_sgt;
-
mempool_t *dmavec_mempool;
- mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
@@ -264,6 +262,12 @@ enum nvme_iod_flags {
/* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
IOD_P2P_BUS_ADDR = 1U << 3,
+
+ /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */
+ IOD_META_P2P_BUS_ADDR = 1U << 4,
+
+ /* Metadata using non-coalesced MPTR */
+ IOD_SINGLE_META_SEGMENT = 1U << 5,
};
struct nvme_dma_vec {
@@ -287,7 +291,8 @@ struct nvme_iod {
unsigned int nr_dma_vecs;
dma_addr_t meta_dma;
- struct sg_table meta_sgt;
+ unsigned int meta_total_len;
+ struct dma_iova_state meta_dma_state;
struct nvme_sgl_desc *meta_descriptor;
};
@@ -644,6 +649,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
return nvmeq->descriptor_pools.large;
}
+static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd)
+{
+ return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG;
+}
+
static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
{
return cmd->common.flags &
@@ -712,6 +722,36 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge,
le32_to_cpu(sg_list[i].length), dir);
}
+static void nvme_unmap_metadata(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ enum dma_data_direction dir = rq_dma_dir(req);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct device *dma_dev = nvmeq->dev->dev;
+ struct nvme_sgl_desc *sge = iod->meta_descriptor;
+
+ if (iod->flags & IOD_SINGLE_META_SEGMENT) {
+ dma_unmap_page(dma_dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
+ }
+
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state,
+ iod->meta_total_len,
+ iod->flags & IOD_META_P2P_BUS_ADDR)) {
+ if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
+ nvme_free_sgls(req, sge, &sge[1]);
+ else
+ dma_unmap_page(dma_dev, iod->meta_dma,
+ iod->meta_total_len, dir);
+ }
+
+ if (iod->meta_descriptor)
+ dma_pool_free(nvmeq->descriptor_pools.small,
+ iod->meta_descriptor, iod->meta_dma);
+}
+
static void nvme_unmap_data(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1013,70 +1053,72 @@ static blk_status_t nvme_map_data(struct request *req)
return nvme_pci_setup_data_prp(req, &iter);
}
-static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
-{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
- sge->type = NVME_SGL_FMT_DATA_DESC << 4;
-}
-
static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
+ unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_dev *dev = nvmeq->dev;
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sgl, *sg;
- unsigned int entries;
+ struct blk_dma_iter iter;
dma_addr_t sgl_dma;
- int rc, i;
+ int i = 0;
- iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
- if (!iod->meta_sgt.sgl)
- return BLK_STS_RESOURCE;
+ if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+ &iod->meta_dma_state, &iter))
+ return iter.status;
- sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
- iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
- iod->meta_sgt.sgl);
- if (!iod->meta_sgt.orig_nents)
- goto out_free_sg;
+ if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+ iod->flags |= IOD_META_P2P_BUS_ADDR;
+ else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
+ entries = 1;
- rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc)
- goto out_free_sg;
+ /*
+ * The NVMe MPTR descriptor has an implicit length that the host and
+ * device must agree on to avoid data/memory corruption. We trust the
+ * kernel allocated correctly based on the format's parameters, so use
+ * the more efficient MPTR to avoid extra dma pool allocations for the
+ * SGL indirection.
+ *
+ * But for user commands, we don't necessarily know what they do, so
+ * the driver can't validate the metadata buffer size. The SGL
+ * descriptor provides an explicit length, so we're relying on that
+ * mechanism to catch any misunderstandings between the application and
+ * device.
+ */
+ if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
+ iod->cmd.common.metadata = cpu_to_le64(iter.addr);
+ iod->meta_total_len = iter.len;
+ iod->meta_dma = iter.addr;
+ iod->meta_descriptor = NULL;
+ return BLK_STS_OK;
+ }
sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
&sgl_dma);
if (!sg_list)
- goto out_unmap_sg;
+ return BLK_STS_RESOURCE;
- entries = iod->meta_sgt.nents;
iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
-
iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
-
- sgl = iod->meta_sgt.sgl;
if (entries == 1) {
- nvme_pci_sgl_set_data_sg(sg_list, sgl);
+ iod->meta_total_len = iter.len;
+ nvme_pci_sgl_set_data(sg_list, &iter);
return BLK_STS_OK;
}
sgl_dma += sizeof(*sg_list);
- nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
- for_each_sg(sgl, sg, entries, i)
- nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
-
- return BLK_STS_OK;
+ do {
+ nvme_pci_sgl_set_data(&sg_list[++i], &iter);
+ iod->meta_total_len += iter.len;
+ } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
- return BLK_STS_RESOURCE;
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
+ if (unlikely(iter.status))
+ nvme_unmap_metadata(req);
+ return iter.status;
}
static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
@@ -1089,6 +1131,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
+ iod->flags |= IOD_SINGLE_META_SEGMENT;
return BLK_STS_OK;
}
@@ -1110,7 +1153,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
iod->flags = 0;
iod->nr_descriptors = 0;
iod->total_len = 0;
- iod->meta_sgt.nents = 0;
+ iod->meta_total_len = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
@@ -1221,25 +1264,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
*rqlist = requeue_list;
}
-static __always_inline void nvme_unmap_metadata(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
-
- if (!iod->meta_sgt.nents) {
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len,
- rq_dma_dir(req));
- return;
- }
-
- dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
- iod->meta_dma);
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-}
-
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
if (blk_integrity_rq(req))
@@ -3045,7 +3069,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
- size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
dev->dmavec_mempool = mempool_create_node(1,
@@ -3054,17 +3077,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->dmavec_mempool)
return -ENOMEM;
-
- dev->iod_meta_mempool = mempool_create_node(1,
- mempool_kmalloc, mempool_kfree,
- (void *)meta_size, GFP_KERNEL,
- dev_to_node(dev->dev));
- if (!dev->iod_meta_mempool)
- goto free;
return 0;
-free:
- mempool_destroy(dev->dmavec_mempool);
- return -ENOMEM;
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3514,7 +3527,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->dmavec_mempool);
- mempool_destroy(dev->iod_meta_mempool);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@@ -3578,7 +3590,6 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->dmavec_mempool);
- mempool_destroy(dev->iod_meta_mempool);
nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
--
2.47.3
* Re: [PATCHv7 0/9] blk dma iter for integrity metadata
From: Martin K. Petersen @ 2025-08-13 16:57 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
Keith,
> - implemented it in blk-mq-dma.c instead of blk-integrity.c to avoid
> having to shuffle functions and common types around
Very much in favor of aligning the data and integrity mapping paths.
The whole series looks good to me.
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
--
Martin K. Petersen
* Re: [PATCHv7 0/9] blk dma iter for integrity metadata
From: Christoph Hellwig @ 2025-08-25 7:55 UTC
To: Keith Busch; +Cc: linux-block, linux-nvme, hch, axboe, joshi.k, Keith Busch
Jens, anything blocking this?
* Re: [PATCHv7 0/9] blk dma iter for integrity metadata
From: Jens Axboe @ 2025-08-25 13:47 UTC
To: linux-block, linux-nvme, Keith Busch; +Cc: hch, joshi.k, Keith Busch
On Wed, 13 Aug 2025 08:31:44 -0700, Keith Busch wrote:
> Previous version:
>
> https://lore.kernel.org/linux-block/20250812135210.4172178-1-kbusch@meta.com/
>
> Changes since v6 addressing review feeback from Christoph:
>
> - Moved the integrity sg conversion to its own patch
>
> [...]
Applied, thanks!
[1/9] blk-mq-dma: create blk_map_iter type
(no commit info)
[2/9] blk-mq-dma: provide the bio_vec array being iterated
(no commit info)
[3/9] blk-mq-dma: require unmap caller provide p2p map type
(no commit info)
[4/9] blk-mq: remove REQ_P2PDMA flag
(no commit info)
[5/9] blk-mq-dma: move common dma start code to a helper
(no commit info)
[6/9] blk-mq-dma: add scatter-less integrity data DMA mapping
(no commit info)
[7/9] blk-integrity: use iterator for mapping sg
(no commit info)
[8/9] nvme-pci: create common sgl unmapping helper
(no commit info)
[9/9] nvme-pci: convert metadata mapping to dma iter
(no commit info)
Best regards,
--
Jens Axboe