* [PATCH] nvme-pci: always use blk_map_iter for metadata
@ 2025-10-20 18:24 Keith Busch
2025-10-20 23:38 ` Chaitanya Kulkarni
2025-10-21 7:19 ` Christoph Hellwig
0 siblings, 2 replies; 7+ messages in thread
From: Keith Busch @ 2025-10-20 18:24 UTC (permalink / raw)
To: linux-nvme, hch; +Cc: Keith Busch, Leon Romanovsky
From: Keith Busch <kbusch@kernel.org>
The dma_map_bvec helper doesn't work for p2p data. Rather than special
case it, just use the same mapping logic so that the driver doesn't need
to consider memory types.
Reported-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/pci.c | 89 ++++++++++++++---------------------------
1 file changed, 31 insertions(+), 58 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c916176bd9f05..3acb5843de450 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -259,9 +259,6 @@ enum nvme_iod_flags {
/* single segment dma mapping */
IOD_SINGLE_SEGMENT = 1U << 2,
-
- /* Metadata using non-coalesced MPTR */
- IOD_SINGLE_META_SEGMENT = 1U << 5,
};
struct nvme_dma_vec {
@@ -596,17 +593,6 @@ enum nvme_use_sgl {
SGL_FORCED,
};
-static inline bool nvme_pci_metadata_use_sgls(struct request *req)
-{
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
-
- if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
- return false;
- return req->nr_integrity_segments > 1 ||
- nvme_req(req)->flags & NVME_REQ_USERCMD;
-}
-
static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
struct request *req)
{
@@ -724,13 +710,6 @@ static void nvme_unmap_metadata(struct request *req)
struct device *dma_dev = nvmeq->dev->dev;
struct nvme_sgl_desc *sge = iod->meta_descriptor;
- if (iod->flags & IOD_SINGLE_META_SEGMENT) {
- dma_unmap_page(dma_dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len,
- rq_dma_dir(req));
- return;
- }
-
if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
iod->meta_total_len)) {
if (nvme_pci_cmd_use_meta_sgl(&iod->cmd))
@@ -1042,21 +1021,27 @@ static blk_status_t nvme_map_data(struct request *req)
return nvme_pci_setup_data_prp(req, &iter);
}
-static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_iod *iod,
+ struct blk_dma_iter *iter)
+{
+ iod->cmd.common.metadata = cpu_to_le64(iter->addr);
+ iod->meta_total_len = iter->len;
+ iod->meta_dma = iter->addr;
+ iod->meta_descriptor = NULL;
+ return BLK_STS_OK;
+}
+
+static blk_status_t nvme_pci_setup_meta_sgls(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_dev *dev = nvmeq->dev;
struct nvme_sgl_desc *sg_list;
- struct blk_dma_iter iter;
dma_addr_t sgl_dma;
int i = 0;
- if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev,
- &iod->meta_dma_state, &iter))
- return iter.status;
-
if (blk_rq_dma_map_coalesce(&iod->meta_dma_state))
entries = 1;
@@ -1073,13 +1058,8 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
* mechanism to catch any misunderstandings between the application and
* device.
*/
- if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
- iod->cmd.common.metadata = cpu_to_le64(iter.addr);
- iod->meta_total_len = iter.len;
- iod->meta_dma = iter.addr;
- iod->meta_descriptor = NULL;
- return BLK_STS_OK;
- }
+ if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD))
+ return nvme_pci_setup_meta_mptr(iod, iter);
sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
&sgl_dma);
@@ -1091,45 +1071,38 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
if (entries == 1) {
- iod->meta_total_len = iter.len;
- nvme_pci_sgl_set_data(sg_list, &iter);
+ iod->meta_total_len = iter->len;
+ nvme_pci_sgl_set_data(sg_list, iter);
return BLK_STS_OK;
}
sgl_dma += sizeof(*sg_list);
do {
- nvme_pci_sgl_set_data(&sg_list[++i], &iter);
- iod->meta_total_len += iter.len;
- } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter));
+ nvme_pci_sgl_set_data(&sg_list[++i], iter);
+ iod->meta_total_len += iter->len;
+ } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, iter));
nvme_pci_sgl_set_seg(sg_list, sgl_dma, i);
- if (unlikely(iter.status))
+ if (unlikely(iter->status))
nvme_unmap_metadata(req);
- return iter.status;
+ return iter->status;
}
-static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
+static blk_status_t nvme_map_metadata(struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct bio_vec bv = rq_integrity_vec(req);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_dev *dev = nvmeq->dev;
+ struct blk_dma_iter iter;
- iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
- return BLK_STS_IOERR;
- iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
- iod->flags |= IOD_SINGLE_META_SEGMENT;
- return BLK_STS_OK;
-}
+ if (blk_rq_integrity_dma_map_iter_start(req, dev->dev,
+ &iod->meta_dma_state, &iter))
+ return iter.status;
-static blk_status_t nvme_map_metadata(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
+ return nvme_pci_setup_meta_mptr(iod, &iter);
- if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
- nvme_pci_metadata_use_sgls(req))
- return nvme_pci_setup_meta_sgls(req);
- return nvme_pci_setup_meta_mptr(req);
+ return nvme_pci_setup_meta_sgls(req, &iter);
}
static blk_status_t nvme_prep_rq(struct request *req)
--
2.47.3
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-20 18:24 [PATCH] nvme-pci: always use blk_map_iter for metadata Keith Busch
@ 2025-10-20 23:38 ` Chaitanya Kulkarni
2025-10-21 7:19 ` Christoph Hellwig
1 sibling, 0 replies; 7+ messages in thread
From: Chaitanya Kulkarni @ 2025-10-20 23:38 UTC (permalink / raw)
To: Keith Busch, linux-nvme@lists.infradead.org, hch@lst.de
Cc: Keith Busch, Leon Romanovsky
On 10/20/25 11:24, Keith Busch wrote:
> From: Keith Busch<kbusch@kernel.org>
>
> The dma_map_bvec helper doesn't work for p2p data. Rather than special
> case it, just use the same mapping logic so that the driver doesn't need
> to consider memory types.
>
> Reported-by: Leon Romanovsky<leon@kernel.org>
> Signed-off-by: Keith Busch<kbusch@kernel.org>
Looks good.
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
-ck
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-20 18:24 [PATCH] nvme-pci: always use blk_map_iter for metadata Keith Busch
2025-10-20 23:38 ` Chaitanya Kulkarni
@ 2025-10-21 7:19 ` Christoph Hellwig
2025-10-21 14:42 ` Keith Busch
1 sibling, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2025-10-21 7:19 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-nvme, hch, Keith Busch, Leon Romanovsky
On Mon, Oct 20, 2025 at 11:24:44AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> The dma_map_bvec helper doesn't work for p2p data. Rather than special
> case it, just use the same mapping logic so that the driver doesn't need
> to consider memory types.
We already consider the memory types for the data path, so treating the
metadata path, where p2p is even more unlikely, this way sounds like the wrong
tradeoff. If we have a single segment we just need a single
is_pci_p2pdma_page check to skip the direct mapping path. Something like
this untested patch:
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c916176bd9f0..c8cfcc64d5ca 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1042,7 +1042,7 @@ static blk_status_t nvme_map_data(struct request *req)
return nvme_pci_setup_data_prp(req, &iter);
}
-static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
+static blk_status_t nvme_pci_setup_meta_iter(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
unsigned int entries = req->nr_integrity_segments;
@@ -1073,7 +1073,9 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
* mechanism to catch any misunderstandings between the application and
* device.
*/
- if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) {
+ if (entries == 1 &&
+ !(nvme_req(req)->flags & NVME_REQ_USERCMD) &&
+ nvme_ctrl_meta_sgl_supported(&dev->ctrl)) {
iod->cmd.common.metadata = cpu_to_le64(iter.addr);
iod->meta_total_len = iter.len;
iod->meta_dma = iter.addr;
@@ -1125,10 +1127,12 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
static blk_status_t nvme_map_metadata(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = req_bvec(req);
- if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
- nvme_pci_metadata_use_sgls(req))
- return nvme_pci_setup_meta_sgls(req);
+ if (((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(req)) ||
+ is_pci_p2pdma_page(bv.bv_page))
+ return nvme_pci_setup_meta_iter(req);
return nvme_pci_setup_meta_mptr(req);
}
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-21 7:19 ` Christoph Hellwig
@ 2025-10-21 14:42 ` Keith Busch
2025-10-26 7:25 ` Leon Romanovsky
0 siblings, 1 reply; 7+ messages in thread
From: Keith Busch @ 2025-10-21 14:42 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Keith Busch, linux-nvme, Leon Romanovsky
On Tue, Oct 21, 2025 at 09:19:35AM +0200, Christoph Hellwig wrote:
> On Mon, Oct 20, 2025 at 11:24:44AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> >
> > The dma_map_bvec helper doesn't work for p2p data. Rather than special
> > case it, just use the same mapping logic so that the driver doesn't need
> > to consider memory types.
>
> We already consider the memory types for the data path, so treating the
> metadata path where p2p is even more unlikely sounds like the wrong
> tradeoff.
Maybe the data path should only use blk_dma_iter too. Is dma_map_bvec
that much faster for the single vector case? I'm going to test both on
real hardware and see if there's a difference in CPU utilization or
latency.
> If we have a single segment we just need a single
> is_pci_p2pdma_page check to skip direct mapping path. Something like
> this untested patch:
The logic you used isn't accurate, but I know what you mean. I'll spin a
more minimal v2 patch.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-21 14:42 ` Keith Busch
@ 2025-10-26 7:25 ` Leon Romanovsky
2025-10-27 7:13 ` Christoph Hellwig
0 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2025-10-26 7:25 UTC (permalink / raw)
To: Keith Busch; +Cc: Christoph Hellwig, Keith Busch, linux-nvme
On Tue, Oct 21, 2025 at 08:42:06AM -0600, Keith Busch wrote:
> On Tue, Oct 21, 2025 at 09:19:35AM +0200, Christoph Hellwig wrote:
> > On Mon, Oct 20, 2025 at 11:24:44AM -0700, Keith Busch wrote:
> > > From: Keith Busch <kbusch@kernel.org>
> > >
> > > The dma_map_bvec helper doesn't work for p2p data. Rather than special
> > > case it, just use the same mapping logic so that the driver doesn't need
> > > to consider memory types.
> >
> > We already consider the memory types for the data path, so treating the
> > metadata path where p2p is even more unlikely sounds like the wrong
> > tradeoff.
>
> Maybe the data path should only use blk_dma_iter too. Is dma_map_bvec
> that much faster for the single vector case? I'm going to test both on
> real hardware and see if there's a difference in CPU utilization or
> latency.
We tried that path (removing the special case for a single segment), but
for a very performance-oriented case: direct mode without IOMMU, no p2p and
extremely powerful RAID (~94M IOPS), Jens and Kanchan saw performance
drop. On consumer grade HW (~2M IOPS), the performance change was not
noticed.
Thanks
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-26 7:25 ` Leon Romanovsky
@ 2025-10-27 7:13 ` Christoph Hellwig
2025-10-27 16:09 ` Keith Busch
0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2025-10-27 7:13 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: Keith Busch, Christoph Hellwig, Keith Busch, linux-nvme
On Sun, Oct 26, 2025 at 09:25:51AM +0200, Leon Romanovsky wrote:
> > Maybe the data path should only use blk_dma_iter too. Is dma_map_bvec
> > that much faster for the single vector case? I'm going to test both on
> > real hardware and see if there's a difference in CPU utilization or
> > latency.
>
> We tried that path (remove special case for single segment), but for
> very performance oriented case: direct mode without IOMMU, no p2p and
> extremely powerful RAID (~94M IOPS), Jens and Kanchan saw performance
> drop. On consumer grade HW (~2M IOPS), the performance change was not
> noticed.
Yes. Sorry, I wanted to reply something like this to Keith earlier but
managed to drop the ball.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] nvme-pci: always use blk_map_iter for metadata
2025-10-27 7:13 ` Christoph Hellwig
@ 2025-10-27 16:09 ` Keith Busch
0 siblings, 0 replies; 7+ messages in thread
From: Keith Busch @ 2025-10-27 16:09 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Leon Romanovsky, Keith Busch, linux-nvme
On Mon, Oct 27, 2025 at 08:13:53AM +0100, Christoph Hellwig wrote:
> On Sun, Oct 26, 2025 at 09:25:51AM +0200, Leon Romanovsky wrote:
> > > Maybe the data path should only use blk_dma_iter too. Is dma_map_bvec
> > > that much faster for the single vector case? I'm going to test both on
> > > real hardware and see if there's a difference in CPU utilization or
> > > latency.
> >
> > We tried that path (remove special case for single segment), but for
> > very performance oriented case: direct mode without IOMMU, no p2p and
> > extremely powerful RAID (~94M IOPS), Jens and Kanchan saw performance
> > drop. On consumer grade HW (~2M IOPS), the performance change was not
> > noticed.
>
> Yes. Sorry, I wanted to reply something like this to Keith earlier but
> managed to drop the ball.
Yeah, no problem. I was reaching the same conclusion anyway and moved on.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-10-27 16:09 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-20 18:24 [PATCH] nvme-pci: always use blk_map_iter for metadata Keith Busch
2025-10-20 23:38 ` Chaitanya Kulkarni
2025-10-21 7:19 ` Christoph Hellwig
2025-10-21 14:42 ` Keith Busch
2025-10-26 7:25 ` Leon Romanovsky
2025-10-27 7:13 ` Christoph Hellwig
2025-10-27 16:09 ` Keith Busch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox