public inbox for linux-nvme@lists.infradead.org
* create single-segment HMBs when using IOMMU
@ 2024-11-01  4:40 Christoph Hellwig
  2024-11-01  4:40 ` [PATCH 1/2] nvme-pci: fix freeing of the HMB descriptor table Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-11-01  4:40 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme

Hi all,

NVMe controllers usually have a limit on how many HMB descriptors they
support, and even if they support multiple they usually prefer fewer of
them.  When running with an IOMMU, the DMA API can coalesce virtually
discontiguous segments, and using the right API doesn't even eat up
vmalloc space for that.
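
A reduced sketch of the idea (patch 2 below has the real code; error
handling omitted):

	/* mask of address bits the DMA layer can merge across, 0 if none */
	if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
		/* physically discontiguous, but a single IOVA range */
		sgt = dma_alloc_noncontiguous(dev->dev, size,
				DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
		/* one HMB descriptor then covers the whole buffer */
	}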

Diffstat:
 pci.c |   74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 12 deletions(-)



* [PATCH 1/2] nvme-pci: fix freeing of the HMB descriptor table
  2024-11-01  4:40 create single-segment HMBs when using IOMMU Christoph Hellwig
@ 2024-11-01  4:40 ` Christoph Hellwig
  2024-11-01  4:40 ` [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible Christoph Hellwig
  2024-11-05 15:59 ` create single-segment HMBs when using IOMMU Keith Busch
  2 siblings, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-11-01  4:40 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme

The HMB descriptor table is sized to the maximum number of descriptors
that could be used for a given device, but __nvme_alloc_host_mem could
break out of the loop early on a memory allocation failure and end up
using fewer descriptors than planned for, which leads to an incorrect
size being passed to dma_free_coherent.

In practice this did not show up because the number of descriptors
tends to be low and the DMA coherent allocator always allocates and
frees at least a page.
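
A reduced sketch of the mismatch (not the literal code paths):

	/* the table is sized for the worst case ... */
	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
			&descs_dma, GFP_KERNEL);
	/* ... but the free side used the actual descriptor count: */
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);

where nr_host_mem_descs can be smaller than max_entries whenever the
allocation loop stopped early.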

Fixes: 87ad72a59a38 ("nvme-pci: implement host memory buffer support")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4b9fda0b1d9a..34daf6d8db07 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -153,6 +153,7 @@ struct nvme_dev {
 	/* host memory buffer support: */
 	u64 host_mem_size;
 	u32 nr_host_mem_descs;
+	u32 host_mem_descs_size;
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
@@ -1966,10 +1967,10 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 
 	kfree(dev->host_mem_desc_bufs);
 	dev->host_mem_desc_bufs = NULL;
-	dma_free_coherent(dev->dev,
-			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+	dma_free_coherent(dev->dev, dev->host_mem_descs_size,
 			dev->host_mem_descs, dev->host_mem_descs_dma);
 	dev->host_mem_descs = NULL;
+	dev->host_mem_descs_size = 0;
 	dev->nr_host_mem_descs = 0;
 }
 
@@ -1977,7 +1978,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 		u32 chunk_size)
 {
 	struct nvme_host_mem_buf_desc *descs;
-	u32 max_entries, len;
+	u32 max_entries, len, descs_size;
 	dma_addr_t descs_dma;
 	int i = 0;
 	void **bufs;
@@ -1990,8 +1991,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
 		max_entries = dev->ctrl.hmmaxd;
 
-	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
-				   &descs_dma, GFP_KERNEL);
+	descs_size = max_entries * sizeof(*descs);
+	descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+			GFP_KERNEL);
 	if (!descs)
 		goto out;
 
@@ -2020,6 +2022,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	dev->host_mem_size = size;
 	dev->host_mem_descs = descs;
 	dev->host_mem_descs_dma = descs_dma;
+	dev->host_mem_descs_size = descs_size;
 	dev->host_mem_desc_bufs = bufs;
 	return 0;
 
@@ -2034,8 +2037,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 
 	kfree(bufs);
 out_free_descs:
-	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
-			descs_dma);
+	dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
 out:
 	dev->host_mem_descs = NULL;
 	return -ENOMEM;
-- 
2.45.2




* [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible
  2024-11-01  4:40 create single-segment HMBs when using IOMMU Christoph Hellwig
  2024-11-01  4:40 ` [PATCH 1/2] nvme-pci: fix freeing of the HMB descriptor table Christoph Hellwig
@ 2024-11-01  4:40 ` Christoph Hellwig
  2024-12-02 19:05   ` Leon Romanovsky
  2024-11-05 15:59 ` create single-segment HMBs when using IOMMU Keith Busch
  2 siblings, 1 reply; 8+ messages in thread
From: Christoph Hellwig @ 2024-11-01  4:40 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg; +Cc: linux-nvme

Use dma_alloc_noncontiguous to allocate a single IOVA-contiguous segment
when backed by an IOMMU.  This makes it easy to use bigger segments and
avoids running into segment limits where possible.
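
The single-segment case then boils down to (a sketch of the code below):

	sgt = dma_alloc_noncontiguous(dev->dev, size, DMA_BIDIRECTIONAL,
			GFP_KERNEL, 0);
	/* the pages are mapped to one contiguous IOVA range */
	descs[0].addr = cpu_to_le64(sgt->sgl->dma_address);
	descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);

so the controller sees exactly one descriptor regardless of how the
underlying pages were allocated.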

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 58 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 34daf6d8db07..0aa26a33f231 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -141,6 +141,7 @@ struct nvme_dev {
 	struct nvme_ctrl ctrl;
 	u32 last_ps;
 	bool hmb;
+	struct sg_table *hmb_sgt;
 
 	mempool_t *iod_mempool;
 
@@ -1952,7 +1953,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
 	return ret;
 }
 
-static void nvme_free_host_mem(struct nvme_dev *dev)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
 {
 	int i;
 
@@ -1967,6 +1968,16 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 
 	kfree(dev->host_mem_desc_bufs);
 	dev->host_mem_desc_bufs = NULL;
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+	if (dev->hmb_sgt)
+		dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+				dev->hmb_sgt, DMA_BIDIRECTIONAL);
+	else
+		nvme_free_host_mem_multi(dev);
+
 	dma_free_coherent(dev->dev, dev->host_mem_descs_size,
 			dev->host_mem_descs, dev->host_mem_descs_dma);
 	dev->host_mem_descs = NULL;
@@ -1974,7 +1985,33 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 	dev->nr_host_mem_descs = 0;
 }
 
-static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
+{
+	dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
+				DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
+	if (!dev->hmb_sgt)
+		return -ENOMEM;
+
+	dev->host_mem_descs = dma_alloc_coherent(dev->dev,
+			sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
+			GFP_KERNEL);
+	if (!dev->host_mem_descs) {
+		dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+				dev->hmb_sgt, DMA_BIDIRECTIONAL);
+		dev->hmb_sgt = NULL;
+		return -ENOMEM;
+	}
+	dev->host_mem_size = size;
+	dev->host_mem_descs_size = sizeof(*dev->host_mem_descs);
+	dev->nr_host_mem_descs = 1;
+
+	dev->host_mem_descs[0].addr =
+		cpu_to_le64(dev->hmb_sgt->sgl->dma_address);
+	dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
+	return 0;
+}
+
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
 		u32 chunk_size)
 {
 	struct nvme_host_mem_buf_desc *descs;
@@ -2049,9 +2086,18 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
 	u64 chunk_size;
 
+	/*
+	 * If there is an IOMMU that can merge pages, try a virtually
+	 * non-contiguous allocation for a single segment first.
+	 */
+	if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
+		if (!nvme_alloc_host_mem_single(dev, preferred))
+			return 0;
+	}
+
 	/* start big and work our way down */
 	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
-		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+		if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) {
 			if (!min || dev->host_mem_size >= min)
 				return 0;
 			nvme_free_host_mem(dev);
@@ -2099,8 +2145,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 		}
 
 		dev_info(dev->ctrl.device,
-			"allocated %lld MiB host memory buffer.\n",
-			dev->host_mem_size >> ilog2(SZ_1M));
+			"allocated %lld MiB host memory buffer (%u segment%s).\n",
+			dev->host_mem_size >> ilog2(SZ_1M),
+			dev->nr_host_mem_descs,
+			str_plural(dev->nr_host_mem_descs));
 	}
 
 	ret = nvme_set_host_mem(dev, enable_bits);
-- 
2.45.2




* Re: create single-segment HMBs when using IOMMU
  2024-11-01  4:40 create single-segment HMBs when using IOMMU Christoph Hellwig
  2024-11-01  4:40 ` [PATCH 1/2] nvme-pci: fix freeing of the HMB descriptor table Christoph Hellwig
  2024-11-01  4:40 ` [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible Christoph Hellwig
@ 2024-11-05 15:59 ` Keith Busch
  2 siblings, 0 replies; 8+ messages in thread
From: Keith Busch @ 2024-11-05 15:59 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Sagi Grimberg, linux-nvme

On Fri, Nov 01, 2024 at 05:40:03AM +0100, Christoph Hellwig wrote:
> Hi all,
> 
> NVMe controllers usually have a limit on how many HMB descriptors they
> support, and even if they support multiple they usually prefer fewer of
> them.  When running with an IOMMU, the DMA API can coalesce virtually
> discontiguous segments, and using the right API doesn't even eat up
> vmalloc space for that.

This all looks good to me. Thanks, applied to nvme-6.13.



* Re: [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible
  2024-11-01  4:40 ` [PATCH 2/2] nvme-pci: use dma_alloc_noncontigous if possible Christoph Hellwig
@ 2024-12-02 19:05   ` Leon Romanovsky
  2024-12-02 19:15     ` Keith Busch
  2024-12-03  0:33     ` Christoph Hellwig
  0 siblings, 2 replies; 8+ messages in thread
From: Leon Romanovsky @ 2024-12-02 19:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, Sagi Grimberg, linux-nvme, Gal Pressman

On Fri, Nov 01, 2024 at 05:40:05AM +0100, Christoph Hellwig wrote:
> Use dma_alloc_noncontiguous to allocate a single IOVA-contiguous segment
> when backed by an IOMMU.  This makes it easy to use bigger segments and
> avoids running into segment limits where possible.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/nvme/host/pci.c | 58 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 53 insertions(+), 5 deletions(-)

<...>

> +static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
>  		u32 chunk_size)
>  {
>  	struct nvme_host_mem_buf_desc *descs;
> @@ -2049,9 +2086,18 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
>  	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
>  	u64 chunk_size;
>  
> +	/*
> +	 * If there is an IOMMU that can merge pages, try a virtually
> +	 * non-contiguous allocation for a single segment first.
> +	 */
> +	if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> +		if (!nvme_alloc_host_mem_single(dev, preferred))
> +			return 0;
> +	}

We assume that the addition of the lines above is the root cause of the
following warning during boot. It happens when we are trying to allocate
a 61 MiB chunk.

[    4.373307] ------------[ cut here ]------------
[    4.373316] WARNING: CPU: 5 PID: 11 at mm/page_alloc.c:4727 __alloc_pages_noprof+0x84c/0xd88
[    4.373332] Modules linked in: crct10dif_ce mlx5_core(+) nvme gpio_mlxbf3 nvme_core mlxfw psample i2c_mlxbf pinctrl_mlxbf3 mlxbf_gige mlxbf_tmfifo pwr_mlxbf ipv6 crc_ccitt
[    4.373353] CPU: 5 UID: 0 PID: 11 Comm: kworker/u64:0 Not tainted 6.12.0-for-upstream-bluefield-2024-11-29-01-33 #1
[    4.373357] Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.9.0.13378 Oct 30 2024
[    4.373360] Workqueue: async async_run_entry_fn
[    4.373365] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    4.373368] pc : __alloc_pages_noprof+0x84c/0xd88
[    4.373371] lr : __dma_direct_alloc_pages.constprop.0+0x234/0x358
[    4.373377] sp : ffffffc08011b890
[    4.373378] x29: ffffffc08011b890 x28: 000000000000000e x27: 0000000003d00000
[    4.373382] x26: ffffff80803cb840 x25: ffffff808197a0c8 x24: 000000000000000e
[    4.373385] x23: 0000000000000cc1 x22: 00000000ffffffff x21: 0000000003cfffff
[    4.373388] x20: 0000000000000000 x19: ffffffffffffffff x18: 0000000000000100
[    4.373391] x17: 0030737973627573 x16: ffffffd634e9d488 x15: 0000000000003a98
[    4.373394] x14: 0000000013ffffff x13: ffffffd636c18d88 x12: 0000000000000001
[    4.373396] x11: 0000000104ab200c x10: f56b3ce21ad3b435 x9 : ffffffd634e9ecbc
[    4.373399] x8 : ffffff808647ba80 x7 : ffffffffffffffff x6 : 0000000000000cc0
[    4.373402] x5 : 0000000000000000 x4 : ffffff80809b9140 x3 : 0000000000000000
[    4.373405] x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffffd636e5d000
[    4.373408] Call trace:
[    4.373410]  __alloc_pages_noprof+0x84c/0xd88 (P)
[    4.373414]  __dma_direct_alloc_pages.constprop.0+0x234/0x358 (L)
[    4.373418]  __dma_direct_alloc_pages.constprop.0+0x234/0x358
[    4.373421]  dma_direct_alloc_pages+0x40/0x190
[    4.373424]  __dma_alloc_pages+0x40/0x80
[    4.373428]  dma_alloc_noncontiguous+0xb4/0x218
[    4.373431]  nvme_setup_host_mem+0x370/0x400 [nvme]
[    4.373442]  nvme_probe+0x688/0x7e8 [nvme]
[    4.373446]  local_pci_probe+0x48/0xb8
[    4.373451]  pci_device_probe+0x1e0/0x200
[    4.373454]  really_probe+0xc8/0x3a0
[    4.373457]  __driver_probe_device+0x84/0x170
[    4.373460]  driver_probe_device+0x44/0x120
[    4.373462]  __driver_attach_async_helper+0x58/0x100
[    4.373465]  async_run_entry_fn+0x40/0x1e8
[    4.373468]  process_one_work+0x16c/0x3e8
[    4.373472]  worker_thread+0x284/0x448
[    4.373476]  kthread+0xec/0xf8
[    4.373479]  ret_from_fork+0x10/0x20
[    4.373483] ---[ end trace 0000000000000000 ]---
[    4.378989] nvme nvme0: allocated 61 MiB host memory buffer (16 segments).
[    4.534672] nvme nvme0: 16/0/0 default/read/poll queues
[    4.537784]  nvme0n1: p1 p2

  4715 struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
  4716                                       int preferred_nid, nodemask_t *nodemask)
...
  4723         /*
  4724          * There are several places where we assume that the order value is sane
  4725          * so bail out early if the request is out of bound.
  4726          */
  4727         if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
  4728                 return NULL;

I see at least two possible solutions: add __GFP_NOWARN to the allocation in
nvme_alloc_host_mem_single(), or apply the following patch:

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4c644bb7f069..baed4059d8a5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2172,7 +2172,8 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,

 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 {
-       u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
+       u64 max_chunk = PAGE_SIZE * MAX_ORDER_NR_PAGES;
+       u64 min_chunk = min_t(u64, preferred, max_chunk);
        u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
        u64 chunk_size;

@@ -2180,7 +2181,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
         * If there is an IOMMU that can merge pages, try a virtually
         * non-contiguous allocation for a single segment first.
         */
-       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
+       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev)) && preferred < max_chunk) {
                if (!nvme_alloc_host_mem_single(dev, preferred))
                        return 0;
        }

What is the preferred way to overcome the warning?

Thanks



* Re: [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible
  2024-12-02 19:05   ` Leon Romanovsky
@ 2024-12-02 19:15     ` Keith Busch
  2024-12-02 19:20       ` Leon Romanovsky
  2024-12-03  0:33     ` Christoph Hellwig
  1 sibling, 1 reply; 8+ messages in thread
From: Keith Busch @ 2024-12-02 19:15 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Christoph Hellwig, Sagi Grimberg, linux-nvme, Gal Pressman

On Mon, Dec 02, 2024 at 09:05:41PM +0200, Leon Romanovsky wrote:
> I see at least two possible solutions: add __GFP_NOWARN to the allocation in
> nvme_alloc_host_mem_single(), or apply the following patch:
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4c644bb7f069..baed4059d8a5 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2172,7 +2172,8 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
> 
>  static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
>  {
> -       u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
> +       u64 max_chunk = PAGE_SIZE * MAX_ORDER_NR_PAGES;
> +       u64 min_chunk = min_t(u64, preferred, max_chunk);
>         u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
>         u64 chunk_size;
> 
> @@ -2180,7 +2181,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
>          * If there is an IOMMU that can merge pages, try a virtually
>          * non-contiguous allocation for a single segment first.
>          */
> -       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> +       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev)) && preferred < max_chunk) {
>                 if (!nvme_alloc_host_mem_single(dev, preferred))
>                         return 0;
>         }
> 
> What is the preferred way to overcome the warning?

I think your max_chunk check suggestion looks pretty good.



* Re: [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible
  2024-12-02 19:15     ` Keith Busch
@ 2024-12-02 19:20       ` Leon Romanovsky
  0 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2024-12-02 19:20 UTC (permalink / raw)
  To: Keith Busch; +Cc: Christoph Hellwig, Sagi Grimberg, linux-nvme, Gal Pressman

On Mon, Dec 02, 2024 at 12:15:49PM -0700, Keith Busch wrote:
> On Mon, Dec 02, 2024 at 09:05:41PM +0200, Leon Romanovsky wrote:
> > I see at least two possible solutions: add __GFP_NOWARN to the allocation in
> > nvme_alloc_host_mem_single(), or apply the following patch:
> > 
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 4c644bb7f069..baed4059d8a5 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2172,7 +2172,8 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
> > 
> >  static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
> >  {
> > -       u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
> > +       u64 max_chunk = PAGE_SIZE * MAX_ORDER_NR_PAGES;
> > +       u64 min_chunk = min_t(u64, preferred, max_chunk);
> >         u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
> >         u64 chunk_size;
> > 
> > @@ -2180,7 +2181,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
> >          * If there is an IOMMU that can merge pages, try a virtually
> >          * non-contiguous allocation for a single segment first.
> >          */
> > -       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> > +       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev)) && preferred < max_chunk) {
> >                 if (!nvme_alloc_host_mem_single(dev, preferred))
> >                         return 0;
> >         }
> > 
> > What is the preferred way to overcome the warning?
> 
> I think your max_chunk check suggestion looks pretty good.

Awesome, will send proper patch tomorrow.

Thanks



* Re: [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible
  2024-12-02 19:05   ` Leon Romanovsky
  2024-12-02 19:15     ` Keith Busch
@ 2024-12-03  0:33     ` Christoph Hellwig
  1 sibling, 0 replies; 8+ messages in thread
From: Christoph Hellwig @ 2024-12-03  0:33 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Christoph Hellwig, Keith Busch, Sagi Grimberg, linux-nvme,
	Gal Pressman

On Mon, Dec 02, 2024 at 09:05:41PM +0200, Leon Romanovsky wrote:
> On Fri, Nov 01, 2024 at 05:40:05AM +0100, Christoph Hellwig wrote:
> > Use dma_alloc_noncontiguous to allocate a single IOVA-contiguous segment
> > when backed by an IOMMU.  This makes it easy to use bigger segments and
> > avoids running into segment limits where possible.
> > 
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > ---
> >  drivers/nvme/host/pci.c | 58 +++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 53 insertions(+), 5 deletions(-)
> 
> <...>
> 
> > +static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
> >  		u32 chunk_size)
> >  {
> >  	struct nvme_host_mem_buf_desc *descs;
> > @@ -2049,9 +2086,18 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
> >  	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
> >  	u64 chunk_size;
> >  
> > +	/*
> > +	 * If there is an IOMMU that can merge pages, try a virtually
> > +	 * non-contiguous allocation for a single segment first.
> > +	 */
> > +	if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> > +		if (!nvme_alloc_host_mem_single(dev, preferred))
> > +			return 0;
> > +	}
> 
> We assume that the addition of the lines above is the root cause of the
> following warning during boot. It happens when we are trying to allocate
> a 61 MiB chunk.

We should not hit this path for dma-direct, but only for dma-iommu,
which can do discontiguous allocations.  If we hit it with dma-direct I
got my math on the boundary check above wrong.  I'll try to figure out
what is wrong, but I'm actually in Japan for meetings, so replies will
not only be delayed but also come from a weird time zone.
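
For reference, a plausible reading of how dma-direct gets here (assuming
dma_get_merge_boundary() returns 0 when the DMA layer cannot merge
segments, as in the dma-direct case):

	mask = dma_get_merge_boundary(dev->dev);	/* 0 on dma-direct */
	if (!(PAGE_SIZE & mask))	/* !(x & 0) is always true */
		/* single-segment path taken anyway */

so the check would also pass without a merging IOMMU, which would match
the __dma_direct_alloc_pages frames in the trace.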

> -       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> +       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev)) && preferred < max_chunk) {

That would basically be a revert of the new functionality...



Thread overview: 8+ messages
2024-11-01  4:40 create single-segment HMBs when using IOMMU Christoph Hellwig
2024-11-01  4:40 ` [PATCH 1/2] nvme-pci: fix freeing of the HMB descriptor table Christoph Hellwig
2024-11-01  4:40 ` [PATCH 2/2] nvme-pci: use dma_alloc_noncontiguous if possible Christoph Hellwig
2024-12-02 19:05   ` Leon Romanovsky
2024-12-02 19:15     ` Keith Busch
2024-12-02 19:20       ` Leon Romanovsky
2024-12-03  0:33     ` Christoph Hellwig
2024-11-05 15:59 ` create single-segment HMBs when using IOMMU Keith Busch
