* [PATCH] nvme: don't set a virt_boundary unless needed
@ 2023-12-21 8:48 Christoph Hellwig
2023-12-21 9:30 ` Sagi Grimberg
0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2023-12-21 8:48 UTC (permalink / raw)
To: marcan, sven, kbusch, axboe, sagi, james.smart
Cc: alyssa, asahi, linux-nvme, kch
NVMe PRPs are a pain and force the expensive virt_boundary checking in the
block layer, prevent secure passthrough, and require scatter/gather I/O
to be split into multiple commands, which is problematic for the upcoming
atomic write support.
Fix the NVMe core to require an opt-in from the drivers for it.
For nvme-apple it is always required as the driver only supports PRPs.
For nvme-pci when SGLs are supported we'll always use them for data I/O
that would require a virt_boundary.
For nvme-rdma the virt boundary is always required, as RDMA MRs are just
as dumb as NVMe PRPs.
For nvme-tcp and nvme-fc I set the flags for now because I don't
understand the drivers fully, but I suspect the flags could be lifted.
For nvme-loop the flag is never set as it doesn't have any requirements
on the I/O format.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/apple.c | 6 +++++
drivers/nvme/host/core.c | 11 ++++++++-
drivers/nvme/host/fc.c | 3 +++
drivers/nvme/host/nvme.h | 4 +++
drivers/nvme/host/pci.c | 52 ++++++++++++++++++++++-----------------
drivers/nvme/host/rdma.c | 6 +++++
drivers/nvme/host/tcp.c | 3 +++
7 files changed, 61 insertions(+), 24 deletions(-)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 596bb11eeba5a9..a1afb54e3b4da8 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1116,6 +1116,12 @@ static void apple_nvme_reset_work(struct work_struct *work)
goto out;
}
+ /*
+ * nvme-apple always uses PRPs and thus needs to set a virt boundary.
+ */
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &anv->ctrl.flags);
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &anv->ctrl.flags);
+
ret = nvme_init_ctrl_finish(&anv->ctrl, false);
if (ret)
goto out;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 22dae2a26ba493..e2816c588c65d1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1872,6 +1872,14 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
return 0;
}
+static bool nvme_need_virt_boundary(struct nvme_ctrl *ctrl,
+ struct request_queue *q)
+{
+ if (q == ctrl->admin_q)
+ return test_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &ctrl->flags);
+ return test_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->flags);
+}
+
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
struct request_queue *q)
{
@@ -1885,7 +1893,8 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
+ if (nvme_need_virt_boundary(ctrl, q))
+ blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 3);
blk_queue_write_cache(q, vwc, vwc);
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9f9a3b35dc64d3..b4540fb31a0d0b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3112,6 +3112,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
set_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags);
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->ctrl.flags);
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &ctrl->ctrl.flags);
+
/*
* Check controller capabilities
*
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 3dbd187896d8d8..dac000291a09ae 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -252,6 +252,10 @@ enum nvme_ctrl_flags {
NVME_CTRL_STOPPED = 3,
NVME_CTRL_SKIP_ID_CNS_CS = 4,
NVME_CTRL_DIRTY_CAPABILITY = 5,
+ /* set the virt_boundary on the admin queue */
+ NVME_CTRL_VIRT_BOUNDARY_ADMIN = 6,
+ /* set the virt_boundary on the I/O queues */
+ NVME_CTRL_VIRT_BOUNDARY_IO = 7,
};
struct nvme_ctrl {
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 507bc149046dc8..6be2e827c426ba 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -60,8 +60,7 @@ MODULE_PARM_DESC(max_host_mem_size_mb,
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
- "Use SGLs when average request segment size is larger or equal to "
- "this size. Use 0 to disable SGLs.");
+ "Use SGLs when > 0. Use 0 to disable SGLs.");
#define NVME_PCI_MIN_QUEUE_SIZE 2
#define NVME_PCI_MAX_QUEUE_SIZE 4095
@@ -504,23 +503,6 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
- int nseg)
-{
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int avg_seg_size;
-
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
-
- if (!nvme_ctrl_sgl_supported(&dev->ctrl))
- return false;
- if (!nvmeq->qid)
- return false;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
- return true;
-}
-
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
@@ -769,12 +751,20 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ bool sgl_supported;
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
+ /*
+ * Check the NVME_CTRL_VIRT_BOUNDARY_IO flag to see if the controller
+ * supports SGLs, as the sgl_threshold module parameter can be changed
+ * at runtime.
+ */
+ sgl_supported = nvmeq->qid &&
+ !test_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &dev->ctrl.flags);
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -782,8 +772,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
- if (nvmeq->qid && sgl_threshold &&
- nvme_ctrl_sgl_supported(&dev->ctrl))
+ if (sgl_supported)
return nvme_setup_sgl_simple(dev, req,
&cmnd->rw, &bv);
}
@@ -806,7 +795,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
goto out_free_sg;
}
- if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
+ if (sgl_supported)
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
@@ -3019,10 +3008,27 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto out_disable;
}
+ /*
+ * The admin queue always uses PRPs and thus needs to set the
+ * virt_boundary to avoid offsets into subsequent PRPs.
+ */
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &dev->ctrl.flags);
+
result = nvme_init_ctrl_finish(&dev->ctrl, false);
if (result)
goto out_disable;
+ /*
+ * If the controller supports SGLs we'll always use them for non-trivial
+ * I/O on the I/O queues unless they were explicitly disabled using the
+ * module parameter.
+ *
+ * This removes the need for the virt_boundary and thus expensive
+ * checking in the block layer.
+ */
+ if (!nvme_ctrl_sgl_supported(&dev->ctrl) || !sgl_threshold)
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &dev->ctrl.flags);
+
nvme_dbbuf_dma_alloc(dev);
result = nvme_setup_host_mem(dev);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index bc90ec3c51b078..7f017b54cc7fd0 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -822,6 +822,12 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (error)
goto out_remove_admin_tag_set;
+ /*
+ * RDMA MRs don't allow offsets into subsequent pages, just like PRPs.
+ */
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->ctrl.flags);
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &ctrl->ctrl.flags);
+
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d79811cfa0ce88..cc29c7899e805a 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2097,6 +2097,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_cleanup_tagset;
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->flags);
+ set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &ctrl->flags);
+
error = nvme_enable_ctrl(ctrl);
if (error)
goto out_stop_queue;
--
2.39.2
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 8:48 [PATCH] nvme: don't set a virt_boundary unless needed Christoph Hellwig
@ 2023-12-21 9:30 ` Sagi Grimberg
2023-12-21 12:17 ` Christoph Hellwig
2023-12-22 1:16 ` Max Gurtovoy
0 siblings, 2 replies; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-21 9:30 UTC (permalink / raw)
To: Christoph Hellwig, marcan, sven, kbusch, axboe, james.smart
Cc: alyssa, asahi, linux-nvme, kch
> NVMe PRPs are a pain and force the expensive virt_boundary checking on
> block layer, prevent secure passthrough and require scatter/gather I/O
> to be split into multiple commands which is problematic for the upcoming
> atomic write support.
But is the threshold still correct? meaning for I/Os small enough the
device will have lower performance? I'm not advocating that we keep it,
but we should at least mention the tradeoff in the change log.
> Fix the NVMe core to require an opt-in from the drivers for it.
>
> For nvme-apple it is always required as the driver only supports PRPs.
>
> For nvme-pci when SGLs are supported we'll always use them for data I/O
> that would require a virt_boundary.
>
> For nvme-rdma the virt boundary is always required, as RMDA MRs are just
> as dumb as NVMe PRPs.
That is actually device dependent. The driver can ask for a pool of
mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
See from ib_srp.c:
--
if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
mr_type = IB_MR_TYPE_SG_GAPS;
else
mr_type = IB_MR_TYPE_MEM_REG;
--
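To illustrate (rough, untested sketch; the nvme_rdma_ctrl/nvme_rdma_device
field names are assumed from the current driver), the rdma opt-in could
then be made conditional on that capability:
--
	/* hypothetical: only require the virt_boundary when the device
	 * cannot register MRs over scatterlists with gaps */
	if (!(ctrl->device->dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)) {
		set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->ctrl.flags);
		set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &ctrl->ctrl.flags);
	}
--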
>
> For nvme-tcp and nvme-fc I set the flags for now because I don't
> understand the drivers fully, but I suspect the flags could be lifted.
tcp can absolutely omit virt boundaries.
> For nvme-loop the flag is never set as it doesn't have any requirements
> on the I/O format.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> drivers/nvme/host/apple.c | 6 +++++
> drivers/nvme/host/core.c | 11 ++++++++-
> drivers/nvme/host/fc.c | 3 +++
> drivers/nvme/host/nvme.h | 4 +++
> drivers/nvme/host/pci.c | 52 ++++++++++++++++++++++-----------------
> drivers/nvme/host/rdma.c | 6 +++++
> drivers/nvme/host/tcp.c | 3 +++
> 7 files changed, 61 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
> index 596bb11eeba5a9..a1afb54e3b4da8 100644
> --- a/drivers/nvme/host/apple.c
> +++ b/drivers/nvme/host/apple.c
> @@ -1116,6 +1116,12 @@ static void apple_nvme_reset_work(struct work_struct *work)
> goto out;
> }
>
> + /*
> + * nvme-apple always uses PRPs and thus needs to set a virt boundary.
> + */
> + set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &anv->ctrl.flags);
> + set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &anv->ctrl.flags);
> +
Why two flags? Why can't the core just always set the blk virt boundary
on the admin request queue?
> ret = nvme_init_ctrl_finish(&anv->ctrl, false);
> if (ret)
> goto out
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 9:30 ` Sagi Grimberg
@ 2023-12-21 12:17 ` Christoph Hellwig
2023-12-21 12:32 ` Sagi Grimberg
2023-12-21 17:03 ` Keith Busch
2023-12-22 1:16 ` Max Gurtovoy
1 sibling, 2 replies; 13+ messages in thread
From: Christoph Hellwig @ 2023-12-21 12:17 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Christoph Hellwig, marcan, sven, kbusch, axboe, james.smart,
alyssa, asahi, linux-nvme, kch
On Thu, Dec 21, 2023 at 11:30:38AM +0200, Sagi Grimberg wrote:
>
>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>> block layer, prevent secure passthrough and require scatter/gather I/O
>> to be split into multiple commands which is problematic for the upcoming
>> atomic write support.
>
> But is the threshold still correct? meaning for I/Os small enough the
> device will have lower performance? I'm not advocating that we keep it,
> but we should at least mention the tradeoff in the change log.
Chaitanya benchmarked it on the first generation of devices that
supported SGLs. On the only SGL-enabled device I have there is no
performance penalty for using SGLs on small transfers, but I'd love
to see numbers from other setups.
>> For nvme-rdma the virt boundary is always required, as RMDA MRs are just
>> as dumb as NVMe PRPs.
>
> That is actually device dependent. The driver can ask for a pool of
> mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
>
> See from ib_srp.c:
> --
> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
> mr_type = IB_MR_TYPE_SG_GAPS;
> else
> mr_type = IB_MR_TYPE_MEM_REG;
> --
For that we'd need to support IB_MR_TYPE_SG_GAPS first, which can
be done as an incremental improvement.
>> + /*
>> + * nvme-apple always uses PRPs and thus needs to set a virt boundary.
>> + */
>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &anv->ctrl.flags);
>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &anv->ctrl.flags);
>> +
>
> Why two flags? Why can't the core just always set the blk virt boundary
> on the admin request queue?
It could, and given that the admin queue isn't performance critical it
probably won't hurt in reality. But why enforce a really weird limit
on the queue if there is no reason for it? The only transport that
treats the admin queue differently is PCIe, and that's just a spec
oddity.
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 12:17 ` Christoph Hellwig
@ 2023-12-21 12:32 ` Sagi Grimberg
2023-12-21 12:40 ` Christoph Hellwig
2023-12-21 17:03 ` Keith Busch
1 sibling, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-21 12:32 UTC (permalink / raw)
To: Christoph Hellwig
Cc: marcan, sven, kbusch, axboe, james.smart, alyssa, asahi,
linux-nvme, kch
On 12/21/23 14:17, Christoph Hellwig wrote:
> On Thu, Dec 21, 2023 at 11:30:38AM +0200, Sagi Grimberg wrote:
>>
>>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>>> block layer, prevent secure passthrough and require scatter/gather I/O
>>> to be split into multiple commands which is problematic for the upcoming
>>> atomic write support.
>>
>> But is the threshold still correct? meaning for I/Os small enough the
>> device will have lower performance? I'm not advocating that we keep it,
>> but we should at least mention the tradeoff in the change log.
>
> Chaitanya benchmarked it on the first generation of devices that
> supported SGLs. On the only SGL-enabled device I have there is no
> performance penality for using SGLs on small transfer, but I'd love
> to see numbers from other setups.
Someone who cares should speak up, I assume.
Still, the change log should indicate that there is no threshold for
sgl/prp anymore.
>>> For nvme-rdma the virt boundary is always required, as RMDA MRs are just
>>> as dumb as NVMe PRPs.
>>
>> That is actually device dependent. The driver can ask for a pool of
>> mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
>>
>> See from ib_srp.c:
>> --
>> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
>> mr_type = IB_MR_TYPE_SG_GAPS;
>> else
>> mr_type = IB_MR_TYPE_MEM_REG;
>> --
>
> For that we'd need to support IB_MR_TYPE_SG_GAPS gaps first, which can
> be done as an incremental improvement.
You actually don't need anything AFAIR, you just add this code and
stop setting the virt boundary.
>>> + /*
>>> + * nvme-apple always uses PRPs and thus needs to set a virt boundary.
>>> + */
>>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &anv->ctrl.flags);
>>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &anv->ctrl.flags);
>>> +
>>
>> Why two flags? Why can't the core just always set the blk virt boundary
>> on the admin request queue?
>
> It could, and given that the admin queue isn't performance critical it
> probably won't hurt in reality. But why enforce a really weird limit
> on the queue if there is no reason for it? The only transport that
> treats the admin queue different is PCIe, and that's just a spec
> oddity.
Exactly because it's odd. Unless there is any benefit to using sgls in
admin commands, let's not flag it per transport.
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 12:32 ` Sagi Grimberg
@ 2023-12-21 12:40 ` Christoph Hellwig
2023-12-25 9:13 ` Sagi Grimberg
0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2023-12-21 12:40 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Christoph Hellwig, marcan, sven, kbusch, axboe, james.smart,
alyssa, asahi, linux-nvme, kch
On Thu, Dec 21, 2023 at 02:32:35PM +0200, Sagi Grimberg wrote:
> Exactly because its odd. Unless there is any benefit of using sgls in
> admin commands lets not flag it per transport.
The other transports always and unconditionally use SGLs anyway. With
the virt boundary we're just adding extra checks to fail certain
passthrough admin commands (the kernel will never generate those cases).
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 12:17 ` Christoph Hellwig
2023-12-21 12:32 ` Sagi Grimberg
@ 2023-12-21 17:03 ` Keith Busch
2023-12-25 9:20 ` Sagi Grimberg
1 sibling, 1 reply; 13+ messages in thread
From: Keith Busch @ 2023-12-21 17:03 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Sagi Grimberg, marcan, sven, axboe, james.smart, alyssa, asahi,
linux-nvme, kch
On Thu, Dec 21, 2023 at 01:17:46PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 21, 2023 at 11:30:38AM +0200, Sagi Grimberg wrote:
> >
> >> NVMe PRPs are a pain and force the expensive virt_boundary checking on
> >> block layer, prevent secure passthrough and require scatter/gather I/O
> >> to be split into multiple commands which is problematic for the upcoming
> >> atomic write support.
> >
> > But is the threshold still correct? meaning for I/Os small enough the
> > device will have lower performance? I'm not advocating that we keep it,
> > but we should at least mention the tradeoff in the change log.
>
> Chaitanya benchmarked it on the first generation of devices that
> supported SGLs. On the only SGL-enabled device I have there is no
> performance penality for using SGLs on small transfer, but I'd love
> to see numbers from other setups.
It's the larger transfers where it gets worse. To exaggerate the
difference, consider sending a 2MB write with a virtually aligned but
discontiguous user buffer: 512 folios.
PRP fits in 1 prp_page_pool block.
SGL needs 3 prp_page_pool blocks, tripling the command's memory usage.
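Rough arithmetic behind those numbers (assuming NVME_CTRL_PAGE_SIZE of 4k,
8-byte PRP entries, 16-byte SGL descriptors, and counting the chaining
entries):

  PRP list: 512 * 8 bytes  = 4k                             -> ~1 pool block
  SGL list: 512 * 16 bytes = 8k + chained segment descriptors -> ~3 pool blocks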
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 9:30 ` Sagi Grimberg
2023-12-21 12:17 ` Christoph Hellwig
@ 2023-12-22 1:16 ` Max Gurtovoy
2023-12-25 10:08 ` Sagi Grimberg
1 sibling, 1 reply; 13+ messages in thread
From: Max Gurtovoy @ 2023-12-22 1:16 UTC (permalink / raw)
To: linux-nvme, Christoph Hellwig, marcan, sven, Keith Busch,
Jens Axboe, James Smart
Cc: alyssa, asahi, Chaitanya Kulkarni
On 21/12/2023 11:30, Sagi Grimberg wrote:
>
>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>> block layer, prevent secure passthrough and require scatter/gather I/O
>> to be split into multiple commands which is problematic for the upcoming
>> atomic write support.
>
> But is the threshold still correct? meaning for I/Os small enough the
> device will have lower performance? I'm not advocating that we keep it,
> but we should at least mention the tradeoff in the change log.
>
>> Fix the NVMe core to require an opt-in from the drivers for it.
>>
>> For nvme-apple it is always required as the driver only supports PRPs.
>>
>> For nvme-pci when SGLs are supported we'll always use them for data I/O
>> that would require a virt_boundary.
>>
>> For nvme-rdma the virt boundary is always required, as RMDA MRs are just
>> as dumb as NVMe PRPs.
>
> That is actually device dependent. The driver can ask for a pool of
> mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
>
> See from ib_srp.c:
> --
> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
> mr_type = IB_MR_TYPE_SG_GAPS;
> else
> mr_type = IB_MR_TYPE_MEM_REG;
For now, I prefer not using the IB_MR_TYPE_SG_GAPS MR in NVMe/RDMA since
in the case of virtually contiguous data buffers it is better to use
IB_MR_TYPE_MEM_REG, which gives much better performance. This is the reason
I didn't add IB_MR_TYPE_SG_GAPS MR support for NVMe/RDMA.
I actually had a plan to re-write the IB_MR_TYPE_SG_GAPS MR logic (or
create a new MR type) that would internally open two MRs, so if the IO is
contiguous it will use the MTT/MEM_REG and if it isn't it will use the
KLM/SG_GAPS.
This is how we implemented the SIG_MR, but it still didn't make it for the
IB_MR_TYPE_SG_GAPS MR.
Actually, I think we should have the same logic in the NVMe PCI driver:
if the IOs can be delivered as PRPs then the driver will prepare the SQE
with PRPs; otherwise, the driver will prepare an SGL.
I think that doing the check in the driver for each IO is not so bad and
devices will benefit from it. Usually HW devices like to work with
contiguous buffers. If the buffers can't be mapped with PRPs, then the
HW will work a bit harder and use SGLs (which is still better than doing a
bounce buffer in the block layer).
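A rough sketch of such a check in the PCI driver (nvme_pci_prp_able() is a
made-up helper, not a tested patch; it assumes the iod->sgt sg_table from
the current driver):
--
static bool nvme_pci_prp_able(struct nvme_iod *iod)
{
	struct scatterlist *sg;
	int i;

	/*
	 * PRPs only allow an offset in the first entry, and every entry
	 * except the last one must end on a NVME_CTRL_PAGE_SIZE boundary.
	 */
	for_each_sg(iod->sgt.sgl, sg, iod->sgt.nents, i) {
		if (i && (sg->offset & (NVME_CTRL_PAGE_SIZE - 1)))
			return false;
		if (i != iod->sgt.nents - 1 &&
		    ((sg->offset + sg->length) & (NVME_CTRL_PAGE_SIZE - 1)))
			return false;
	}
	return true;
}
--
nvme_map_data() would then pick nvme_pci_setup_prps() when this returns
true and nvme_pci_setup_sgls() otherwise.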
I actually did a POC internally for NVMe/RDMA: I created an sg_gaps ib_mr
and a mem_reg ib_mr, checked the buffer mapping for each IO, and got a
big benefit when the buffers were discontig (the sg_gaps mr was used).
Also, the contig-buffer performance didn't degrade because of the check of
the buffer mapping.
I created a fio flag that on purpose sends discontig IOs for my testing.
WDYT ?
> --
>
>>
>> For nvme-tcp and nvme-fc I set the flags for now because I don't
>> understand the drivers fully, but I suspect the flags could be lifted.
>
> tcp can absolutely omit virt boundaries.
>
>> For nvme-loop the flag is never set as it doesn't have any requirements
>> on the I/O format.
>>
>> Signed-off-by: Christoph Hellwig <hch@lst.de>
>> ---
>> drivers/nvme/host/apple.c | 6 +++++
>> drivers/nvme/host/core.c | 11 ++++++++-
>> drivers/nvme/host/fc.c | 3 +++
>> drivers/nvme/host/nvme.h | 4 +++
>> drivers/nvme/host/pci.c | 52 ++++++++++++++++++++++-----------------
>> drivers/nvme/host/rdma.c | 6 +++++
>> drivers/nvme/host/tcp.c | 3 +++
>> 7 files changed, 61 insertions(+), 24 deletions(-)
>>
>> diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
>> index 596bb11eeba5a9..a1afb54e3b4da8 100644
>> --- a/drivers/nvme/host/apple.c
>> +++ b/drivers/nvme/host/apple.c
>> @@ -1116,6 +1116,12 @@ static void apple_nvme_reset_work(struct
>> work_struct *work)
>> goto out;
>> }
>> + /*
>> + * nvme-apple always uses PRPs and thus needs to set a virt
>> boundary.
>> + */
>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &anv->ctrl.flags);
>> + set_bit(NVME_CTRL_VIRT_BOUNDARY_ADMIN, &anv->ctrl.flags);
>> +
>
> Why two flags? Why can't the core just always set the blk virt boundary
> on the admin request queue?
I also think that a single flag is good enough.
>
>> ret = nvme_init_ctrl_finish(&anv->ctrl, false);
>> if (ret)
>> goto out
>
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 12:40 ` Christoph Hellwig
@ 2023-12-25 9:13 ` Sagi Grimberg
0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-25 9:13 UTC (permalink / raw)
To: Christoph Hellwig
Cc: marcan, sven, kbusch, axboe, james.smart, alyssa, asahi,
linux-nvme, kch
>> Exactly because its odd. Unless there is any benefit of using sgls in
>> admin commands lets not flag it per transport.
>
> The other transports always and unconditionally use SGLs anyway. With
> the virt boundary we're just adding extra checks to fail certain
> passthrough admin commands (the kernel will never generate those cases).
I just don't see the point of adding a per-transport flag. Either we
always enforce the admin queue virt_boundary in the core because it's not
important enough, or we always allow sgls and have pci set this flag
locally (given that only it has this quirk).
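I.e. the first option would be something like (sketch only, keeping a
single flag for the I/O queues):
--
	if (q == ctrl->admin_q ||
	    test_bit(NVME_CTRL_VIRT_BOUNDARY_IO, &ctrl->flags))
		blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
--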
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-21 17:03 ` Keith Busch
@ 2023-12-25 9:20 ` Sagi Grimberg
0 siblings, 0 replies; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-25 9:20 UTC (permalink / raw)
To: Keith Busch, Christoph Hellwig
Cc: marcan, sven, axboe, james.smart, alyssa, asahi, linux-nvme, kch
>>>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>>>> block layer, prevent secure passthrough and require scatter/gather I/O
>>>> to be split into multiple commands which is problematic for the upcoming
>>>> atomic write support.
>>>
>>> But is the threshold still correct? meaning for I/Os small enough the
>>> device will have lower performance? I'm not advocating that we keep it,
>>> but we should at least mention the tradeoff in the change log.
>>
>> Chaitanya benchmarked it on the first generation of devices that
>> supported SGLs. On the only SGL-enabled device I have there is no
>> performance penality for using SGLs on small transfer, but I'd love
>> to see numbers from other setups.
>
> It's the larger transfers where it gets worse. To exaggerate the
> difference, consider send a 2MB write with virtually aligned but
> discontiguous user buffer: 512 folios.
>
> PRP fits in 1 prp_page_pool block.
>
> SGL needs 3 prp_page_pool blocks, tripling the command's memory usage.
My assumption was that in large transfers the memory overhead is
negligible and the controller dma streaming dominates the performance.
The threshold was the minimum transfer size at which to use sgls.
In any event, I think that we are talking about theoretical gains/losses
here. Unless anyone shows a real loss here, we should go with
simplicity.
Theoretically the driver could add incremental optimizations if the
request buffer can use either prp/sgl, but this is something that
should be explored with real device(s) performance measurements I think.
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-22 1:16 ` Max Gurtovoy
@ 2023-12-25 10:08 ` Sagi Grimberg
2023-12-25 10:36 ` Max Gurtovoy
0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-25 10:08 UTC (permalink / raw)
To: Max Gurtovoy, linux-nvme, Christoph Hellwig, marcan, sven,
Keith Busch, Jens Axboe, James Smart
Cc: alyssa, asahi, Chaitanya Kulkarni
On 12/22/23 03:16, Max Gurtovoy wrote:
>
>
> On 21/12/2023 11:30, Sagi Grimberg wrote:
>>
>>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>>> block layer, prevent secure passthrough and require scatter/gather I/O
>>> to be split into multiple commands which is problematic for the upcoming
>>> atomic write support.
>>
>> But is the threshold still correct? meaning for I/Os small enough the
>> device will have lower performance? I'm not advocating that we keep it,
>> but we should at least mention the tradeoff in the change log.
>>
>>> Fix the NVMe core to require an opt-in from the drivers for it.
>>>
>>> For nvme-apple it is always required as the driver only supports PRPs.
>>>
>>> For nvme-pci when SGLs are supported we'll always use them for data I/O
>>> that would require a virt_boundary.
>>>
>>> For nvme-rdma the virt boundary is always required, as RMDA MRs are just
>>> as dumb as NVMe PRPs.
>>
>> That is actually device dependent. The driver can ask for a pool of
>> mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
>>
>> See from ib_srp.c:
>> --
>> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
>> mr_type = IB_MR_TYPE_SG_GAPS;
>> else
>> mr_type = IB_MR_TYPE_MEM_REG;
>
> For now, I prefer not using the IB_MR_TYPE_SG_GAPS MR in NVMe/RDMA since
> in the case of virtual contiguous data buffers it is better to use
> IB_MR_TYPE_MEM_REG. It gives much better performance. This is the reason
> I didn't add IB_MR_TYPE_SG_GAPS MR support for NVMe/RDMA.
I see. I guess it is not *that* trivial then.
> I actually had a plan to re-write the IB_MR_TYPE_SG_GAPS MR logic (or
> create a new MR type) that will internally open 2 MRs so if the IO is
> contiguous it will use the MTT/MEM_REG and if it isn't it will use the
> KLM/SG_GAPS.
> This is how we implemented the SIG_MR but still didn't make it for the
> IB_MR_TYPE_SG_GAPS MR.
Sounds like a reasonable option. But doesn't this mean that the
driver will need to scan the page scatterlist to determine what internal
mr to use? Even a fallback mechanism can be affected by a given
workload. Plus there is the cost of doubling the number of preallocated
mrs.
> Actually, I think we should have the same logic in the NVMe PCI driver:
> if the IOs can be delivered as PRPs then the driver will prepare SQE
> with PRP. Otherwise, driver will prepare SGL.
> I think that doing the check in the driver for each IO is not so bad and
> devices will get benefit from it. Usually HW devices like to work with
> contiguous buffers. If the buffers can't be mapped with PRPs, then the
> HW will work a bit harder and use SGLs (it is better than doing a bounce
> buffer in the block layer).
>
> I actually did a POC internally for NVMe/RDMA and created sg_gaps ib_mr
> and mem_reg ib_mr and checked the buffers mapping for each IO and got a
> big benefit if the buffers were discontig (used the sg_gaps mr). Also
> the contig buffers performance didn't degraded because of the check of
> the buffers mapping.
>
> I created a fio flags that in purpose sends discontig IOs for my testing.
>
> WDYT ?
Sounds possible. However for rdma we probably want this transparent to
the ulp such that all consumers can have this benefit. Also perhaps add
this logic in the rdma core so other drivers can use it as well
(although I don't know if any other rdma driver supports sg gaps
anyways).
If this proves to be a good approach, pci can do something similar.
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-25 10:08 ` Sagi Grimberg
@ 2023-12-25 10:36 ` Max Gurtovoy
2023-12-25 10:44 ` Sagi Grimberg
0 siblings, 1 reply; 13+ messages in thread
From: Max Gurtovoy @ 2023-12-25 10:36 UTC (permalink / raw)
To: Sagi Grimberg, linux-nvme, Christoph Hellwig, marcan, sven,
Keith Busch, Jens Axboe, James Smart
Cc: alyssa, asahi, Chaitanya Kulkarni
On 25/12/2023 12:08, Sagi Grimberg wrote:
>
>
> On 12/22/23 03:16, Max Gurtovoy wrote:
>>
>>
>> On 21/12/2023 11:30, Sagi Grimberg wrote:
>>>
>>>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>>>> block layer, prevent secure passthrough and require scatter/gather I/O
>>>> to be split into multiple commands which is problematic for the
>>>> upcoming
>>>> atomic write support.
>>>
>>> But is the threshold still correct? meaning for I/Os small enough the
>>> device will have lower performance? I'm not advocating that we keep it,
>>> but we should at least mention the tradeoff in the change log.
>>>
>>>> Fix the NVMe core to require an opt-in from the drivers for it.
>>>>
>>>> For nvme-apple it is always required as the driver only supports PRPs.
>>>>
>>>> For nvme-pci when SGLs are supported we'll always use them for data I/O
>>>> that would require a virt_boundary.
>>>>
>>>> For nvme-rdma the virt boundary is always required, as RMDA MRs are
>>>> just
>>>> as dumb as NVMe PRPs.
>>>
>>> That is actually device dependent. The driver can ask for a pool of
>>> mrs with type IB_MR_TYPE_SG_GAPS if the device supports IBK_SG_GAPS_REG.
>>>
>>> See from ib_srp.c:
>>> --
>>> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
>>> mr_type = IB_MR_TYPE_SG_GAPS;
>>> else
>>> mr_type = IB_MR_TYPE_MEM_REG;
>>
>> For now, I prefer not using the IB_MR_TYPE_SG_GAPS MR in NVMe/RDMA
>> since in the case of virtual contiguous data buffers it is better to
>> use IB_MR_TYPE_MEM_REG. It gives much better performance. This is the
>> reason I didn't add IB_MR_TYPE_SG_GAPS MR support for NVMe/RDMA.
>
> I see. I guess it is not *that* trivial then.
>
>> I actually had a plan to re-write the IB_MR_TYPE_SG_GAPS MR logic (or
>> create a new MR type) that will internally open 2 MRs so if the IO is
>> contiguous it will use the MTT/MEM_REG and if it isn't it will use the
>> KLM/SG_GAPS.
>> This is how we implemented the SIG_MR but still didn't make it for the
>> IB_MR_TYPE_SG_GAPS MR.
>
> Sounds like a reasonable option. But doesn't think mean that the
> driver will need to scan the page scatterlist to determine what internal
> mr to use? Even a fallback mechanism can be affected by a given
> workload. Plus there is the cost of doubling the number of preallocated
> mrs.
>
Scanning the scatterlist is done anyway for mapping purposes so I don't
think it will affect the performance.
The cost of doubling the number of MRs is what we need to pay to get
optimal performance for contig and discontig IOs, I guess.
>> Actually, I think we should have the same logic in the NVMe PCI driver:
>> if the IOs can be delivered as PRPs then the driver will prepare SQE
>> with PRP. Otherwise, driver will prepare SGL.
>> I think that doing the check in the driver for each IO is not so bad
>> and devices will get benefit from it. Usually HW devices like to work
>> with contiguous buffers. If the buffers can't be mapped with PRPs,
>> then the HW will work a bit harder and use SGLs (it is better than
>> doing a bounce buffer in the block layer).
>>
>> I actually did a POC internally for NVMe/RDMA and created sg_gaps
>> ib_mr and mem_reg ib_mr and checked the buffers mapping for each IO
>> and got a big benefit if the buffers were discontig (used the sg_gaps
>> mr). Also the contig buffers performance didn't degraded because of
>> the check of the buffers mapping.
>>
>> I created a fio flags that in purpose sends discontig IOs for my testing.
>>
>> WDYT ?
>
> Sounds possible. However for rdma we probably want this transparent to
> the ulp such that all consumers can have this benefit. Also perhaps add
> this logic in the rdma core so other drivers can use it as well
> (although I don't know if any other rdma driver supports sg gaps
> anyways).
>
> If this proves to be a good approach, pci can do something similar.
For RDMA, I plan to do it in the device driver (mlx5) layer and not the
ib_core layer. It is unique to our implementation.
For the NVMe PCI case, I suggested doing it unrelated to the NVMe/RDMA
solution. The NVMe/PCI is actually the device driver of the PCI device
and the scanning of the scatterlist should happen in the device driver.
I suggest trying this solution since we are always debating about thresholds
and when to use SGLs.
Now that Christoph opens the gate for the driver to work with discontig
IOs I believe that for *any* discontig IO we should use SGLs and for
*any* contig IO we should use PRPs.
NVMe SSD vendors will be able to test this approach and report their
numbers.
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-25 10:36 ` Max Gurtovoy
@ 2023-12-25 10:44 ` Sagi Grimberg
2023-12-25 12:31 ` Max Gurtovoy
0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2023-12-25 10:44 UTC (permalink / raw)
To: Max Gurtovoy, linux-nvme, Christoph Hellwig, marcan, sven,
Keith Busch, Jens Axboe, James Smart
Cc: alyssa, asahi, Chaitanya Kulkarni
On 12/25/23 12:36, Max Gurtovoy wrote:
>
>
> On 25/12/2023 12:08, Sagi Grimberg wrote:
>>
>>
>> On 12/22/23 03:16, Max Gurtovoy wrote:
>>>
>>>
>>> On 21/12/2023 11:30, Sagi Grimberg wrote:
>>>>
>>>>> NVMe PRPs are a pain and force the expensive virt_boundary checking on
>>>>> block layer, prevent secure passthrough and require scatter/gather I/O
>>>>> to be split into multiple commands which is problematic for the
>>>>> upcoming
>>>>> atomic write support.
>>>>
>>>> But is the threshold still correct? meaning for I/Os small enough the
>>>> device will have lower performance? I'm not advocating that we keep it,
>>>> but we should at least mention the tradeoff in the change log.
>>>>
>>>>> Fix the NVMe core to require an opt-in from the drivers for it.
>>>>>
>>>>> For nvme-apple it is always required as the driver only supports PRPs.
>>>>>
>>>>> For nvme-pci when SGLs are supported we'll always use them for data
>>>>> I/O
>>>>> that would require a virt_boundary.
>>>>>
>>>>> For nvme-rdma the virt boundary is always required, as RMDA MRs are
>>>>> just
>>>>> as dumb as NVMe PRPs.
>>>>
>>>> That is actually device dependent. The driver can ask for a pool of
>>>> mrs with type IB_MR_TYPE_SG_GAPS if the device supports
>>>> IBK_SG_GAPS_REG.
>>>>
>>>> See from ib_srp.c:
>>>> --
>>>> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
>>>> mr_type = IB_MR_TYPE_SG_GAPS;
>>>> else
>>>> mr_type = IB_MR_TYPE_MEM_REG;
>>>
>>> For now, I prefer not using the IB_MR_TYPE_SG_GAPS MR in NVMe/RDMA
>>> since in the case of virtual contiguous data buffers it is better to
>>> use IB_MR_TYPE_MEM_REG. It gives much better performance. This is the
>>> reason I didn't add IB_MR_TYPE_SG_GAPS MR support for NVMe/RDMA.
>>
>> I see. I guess it is not *that* trivial then.
>>
>>> I actually had a plan to re-write the IB_MR_TYPE_SG_GAPS MR logic (or
>>> create a new MR type) that will internally open 2 MRs so if the IO is
>>> contiguous it will use the MTT/MEM_REG and if it isn't it will use
>>> the KLM/SG_GAPS.
>>> This is how we implemented the SIG_MR but still didn't make it for
>>> the IB_MR_TYPE_SG_GAPS MR.
>>
>> Sounds like a reasonable option. But doesn't think mean that the
>> driver will need to scan the page scatterlist to determine what internal
>> mr to use? Even a fallback mechanism can be affected by a given
>> workload. Plus there is the cost of doubling the number of preallocated
>> mrs.
>>
>
> Scanning the scatterlist is done anyway for mapping purposes so I don't
> think it will affect the performance.
> The cost of doubling the number of MRs is the what we need to pay to get
> optimal performance for contig and discontig IOs, I guess..
>
>>> Actually, I think we should have the same logic in the NVMe PCI driver:
>>> if the IOs can be delivered as PRPs then the driver will prepare SQE
>>> with PRP. Otherwise, driver will prepare SGL.
>>> I think that doing the check in the driver for each IO is not so bad
>>> and devices will get benefit from it. Usually HW devices like to work
>>> with contiguous buffers. If the buffers can't be mapped with PRPs,
>>> then the HW will work a bit harder and use SGLs (it is better than
>>> doing a bounce buffer in the block layer).
>>>
>>> I actually did a POC internally for NVMe/RDMA and created sg_gaps
>>> ib_mr and mem_reg ib_mr and checked the buffers mapping for each IO
>>> and got a big benefit if the buffers were discontig (used the sg_gaps
>>> mr). Also the contig buffers performance didn't degraded because of
>>> the check of the buffers mapping.
>>>
>>> I created a fio flags that in purpose sends discontig IOs for my
>>> testing.
>>>
>>> WDYT ?
>>
>> Sounds possible. However for rdma we probably want this transparent to
>> the ulp such that all consumers can have this benefit. Also perhaps add
>> this logic in the rdma core so other drivers can use it as well
>> (although I don't know if any other rdma driver supports sg gaps
>> anyways).
>>
>> If this proves to be a good approach, pci can do something similar.
>
> For RDMA, I plan to do it in the device driver (mlx5) layer and not the
> ib_core layer. It is unique to our implementation.
Well, SG_GAPS is not intended to be a unique capability (although it is
today in practice I guess).
>
> For the NVMe PCI case, I suggested doing it unrelated to the NVMe/RDMA
> solution. The NVMe/PCI is actually the device driver of the PCI device
> and the scanning of the scatterlist should happen in the device driver.
> I suggest to try this solution since we always debating about thresholds
> and when to use SGLs.
> Now that Christoph opens the gate for the driver to work with discontig
> IOs I believe that for *any* discontig IO we should use SGLs and for
> *any* contig IO we should use PRPs.
Why *any* contig IO? There are certainly cases where sgls would perform
better than prps, I'd assume... For example if a large buffer
is physically contiguous (say a huge page)?
* Re: [PATCH] nvme: don't set a virt_boundary unless needed
2023-12-25 10:44 ` Sagi Grimberg
@ 2023-12-25 12:31 ` Max Gurtovoy
0 siblings, 0 replies; 13+ messages in thread
From: Max Gurtovoy @ 2023-12-25 12:31 UTC (permalink / raw)
To: Sagi Grimberg, linux-nvme, Christoph Hellwig, marcan, sven,
Keith Busch, Jens Axboe, James Smart
Cc: alyssa, asahi, Chaitanya Kulkarni
On 25/12/2023 12:44, Sagi Grimberg wrote:
>
>
> On 12/25/23 12:36, Max Gurtovoy wrote:
>>
>>
>> On 25/12/2023 12:08, Sagi Grimberg wrote:
>>>
>>>
>>> On 12/22/23 03:16, Max Gurtovoy wrote:
>>>>
>>>>
>>>> On 21/12/2023 11:30, Sagi Grimberg wrote:
>>>>>
>>>>>> NVMe PRPs are a pain and force the expensive virt_boundary
>>>>>> checking on
>>>>>> block layer, prevent secure passthrough and require scatter/gather
>>>>>> I/O
>>>>>> to be split into multiple commands which is problematic for the
>>>>>> upcoming
>>>>>> atomic write support.
>>>>>
>>>>> But is the threshold still correct? meaning for I/Os small enough the
>>>>> device will have lower performance? I'm not advocating that we keep
>>>>> it,
>>>>> but we should at least mention the tradeoff in the change log.
>>>>>
>>>>>> Fix the NVMe core to require an opt-in from the drivers for it.
>>>>>>
>>>>>> For nvme-apple it is always required as the driver only supports
>>>>>> PRPs.
>>>>>>
>>>>>> For nvme-pci when SGLs are supported we'll always use them for
>>>>>> data I/O
>>>>>> that would require a virt_boundary.
>>>>>>
>>>>>> For nvme-rdma the virt boundary is always required, as RMDA MRs
>>>>>> are just
>>>>>> as dumb as NVMe PRPs.
>>>>>
>>>>> That is actually device dependent. The driver can ask for a pool of
>>>>> mrs with type IB_MR_TYPE_SG_GAPS if the device supports
>>>>> IBK_SG_GAPS_REG.
>>>>>
>>>>> See from ib_srp.c:
>>>>> --
>>>>> if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
>>>>> mr_type = IB_MR_TYPE_SG_GAPS;
>>>>> else
>>>>> mr_type = IB_MR_TYPE_MEM_REG;
>>>>
>>>> For now, I prefer not using the IB_MR_TYPE_SG_GAPS MR in NVMe/RDMA
>>>> since in the case of virtual contiguous data buffers it is better to
>>>> use IB_MR_TYPE_MEM_REG. It gives much better performance. This is
>>>> the reason I didn't add IB_MR_TYPE_SG_GAPS MR support for NVMe/RDMA.
>>>
>>> I see. I guess it is not *that* trivial then.
>>>
>>>> I actually had a plan to re-write the IB_MR_TYPE_SG_GAPS MR logic
>>>> (or create a new MR type) that will internally open 2 MRs so if the
>>>> IO is contiguous it will use the MTT/MEM_REG and if it isn't it will
>>>> use the KLM/SG_GAPS.
>>>> This is how we implemented the SIG_MR but still didn't make it for
>>>> the IB_MR_TYPE_SG_GAPS MR.
>>>
>>> Sounds like a reasonable option. But doesn't think mean that the
>>> driver will need to scan the page scatterlist to determine what internal
>>> mr to use? Even a fallback mechanism can be affected by a given
>>> workload. Plus there is the cost of doubling the number of preallocated
>>> mrs.
>>>
>>
>> Scanning the scatterlist is done anyway for mapping purposes so I
>> don't think it will affect the performance.
>> The cost of doubling the number of MRs is the what we need to pay to
>> get optimal performance for contig and discontig IOs, I guess..
>>
>>>> Actually, I think we should have the same logic in the NVMe PCI driver:
>>>> if the IOs can be delivered as PRPs then the driver will prepare SQE
>>>> with PRP. Otherwise, driver will prepare SGL.
>>>> I think that doing the check in the driver for each IO is not so bad
>>>> and devices will get benefit from it. Usually HW devices like to
>>>> work with contiguous buffers. If the buffers can't be mapped with
>>>> PRPs, then the HW will work a bit harder and use SGLs (it is better
>>>> than doing a bounce buffer in the block layer).
>>>>
>>>> I actually did a POC internally for NVMe/RDMA and created sg_gaps
>>>> ib_mr and mem_reg ib_mr and checked the buffers mapping for each IO
>>>> and got a big benefit if the buffers were discontig (used the
>>>> sg_gaps mr). Also the contig buffers performance didn't degraded
>>>> because of the check of the buffers mapping.
>>>>
>>>> I created a fio flags that in purpose sends discontig IOs for my
>>>> testing.
>>>>
>>>> WDYT ?
>>>
>>> Sounds possible. However for rdma we probably want this transparent to
>>> the ulp such that all consumers can have this benefit. Also perhaps add
>>> this logic in the rdma core so other drivers can use it as well
>>> (although I don't know if any other rdma driver supports sg gaps
>>> anyways).
>>>
>>> If this proves to be a good approach, pci can do something similar.
>>
>> For RDMA, I plan to do it in the device driver (mlx5) layer and not
>> the ib_core layer. It is unique to our implementation.
>
> Well, SG_GAPS is not intended to be a unique capability (although it is
> today in practice I guess).
The uniqueness I meant is to use 2 MRs/Mkeys to implement it. Maybe
there are devices that can do it in a single MR/mkey.
>
>>
>> For the NVMe PCI case, I suggested doing it unrelated to the NVMe/RDMA
>> solution. The NVMe/PCI is actually the device driver of the PCI device
>> and the scanning of the scatterlist should happen in the device driver.
>> I suggest to try this solution since we always debating about
>> thresholds and when to use SGLs.
>> Now that Christoph opens the gate for the driver to work with
>> discontig IOs I believe that for *any* discontig IO we should use SGLs
>> and for *any* contig IO we should use PRPs.
>
> Why *any* contig IO? There are certainly cases where sgls would perform
> better with sgls than prps I'd assume... For example if a large buffer
> is physically contiguous (say a huge page)?
Yup, PRP is limited to MPS.
There probably should be some capability added to the NVMe spec to help
drivers decide the optimal IO PSDT to use.
So I guess we can say for now:
if scatterlist is discontig or data_size > sgl_threshold - use sgls
else use PRPs
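In pseudo-C that heuristic would be roughly (sketch only;
nvme_pci_prp_able() is the hypothetical helper from my earlier mail, and
the sgl_threshold == 0 case that disables SGLs is ignored for brevity):

	bool use_sgl = !nvme_pci_prp_able(iod) ||
		       blk_rq_payload_bytes(req) > sgl_threshold;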