public inbox for netdev@vger.kernel.org
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, Gal Pressman <gal@nvidia.com>,
	Dragos Tatulea <dtatulea@nvidia.com>,
	Moshe Shemesh <moshe@nvidia.com>, Nimrod Oren <noren@nvidia.com>
Subject: [PATCH net-next 3/3] net/mlx5: use internal dma pools for frag buf alloc
Date: Tue, 28 Apr 2026 08:29:20 +0300
Message-ID: <20260428052920.219201-4-tariqt@nvidia.com>
In-Reply-To: <20260428052920.219201-1-tariqt@nvidia.com>

From: Nimrod Oren <noren@nvidia.com>

Add mlx5_dma_pool alloc/free paths, and wire the mlx5_frag_buf
allocation and free paths to use them.

mlx5_frag_buf_alloc_node() now selects an mlx5_dma_pool to allocate
fragments from, instead of directly allocating full coherent pages.
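
For illustration only, a user-space sketch of the selection math: the
requested size is rounded up to a power-of-two block, clamped to the
page size, and the resulting shift indexes into the per-node pool
array. The MIN_BLOCK_SHIFT value below is an assumption standing in
for the real MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT constant:

  #include <stdio.h>

  #define MIN_BLOCK_SHIFT 6       /* assumed: smallest pooled block, 64B */
  #define HOST_PAGE_SHIFT 12      /* 4K pages */

  int main(void)
  {
          for (int size = 64; size <= 4096; size <<= 2) {
                  int shift = MIN_BLOCK_SHIFT;

                  /* Round size up to the nearest power-of-two block. */
                  while ((1 << shift) < size && shift < HOST_PAGE_SHIFT)
                          shift++;
                  printf("size %4d -> pool index %d (block %d)\n",
                         size, shift - MIN_BLOCK_SHIFT, 1 << shift);
          }
          return 0;
  }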

mlx5_frag_buf_free() returns each fragment to the pool it was
allocated from.

mlx5_dma_pool_alloc() keeps allocation fast by maintaining pages that
still have free indexes at the head of the pool's page list, so the
common allocation path can take a free index immediately. A new
backing page is allocated only when no free index is available.
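
A minimal user-space model of this fast path, assuming 64 blocks per
page and a single bitmap word (the patch uses the kernel bitmap API
instead; here, a set bit means the index is free):

  #include <stdint.h>
  #include <stdio.h>

  static int take_free_idx(uint64_t *bitmap)
  {
          if (!*bitmap)
                  return -1;      /* page full: caller grabs a new page */

          int idx = __builtin_ctzll(*bitmap);     /* lowest free index */

          *bitmap &= *bitmap - 1; /* clear it: index now in use */
          return idx;
  }

  int main(void)
  {
          uint64_t bitmap = ~0ULL;        /* fresh page: all 64 indexes free */

          printf("%d %d\n", take_free_idx(&bitmap), take_free_idx(&bitmap));
          return 0;       /* prints: 0 1 */
  }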

mlx5_dma_pool_free() returns released indexes to the pool and frees a
backing page once all of its indexes become free. This avoids keeping
fully free pages for the lifetime of the pool and reduces coherent DMA
memory footprint.
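
A companion sketch for the free-side bookkeeping, under the same
64-blocks-per-page assumption. It models only the two decisions the
text describes: a page that was full becomes allocatable again, and a
page whose indexes are all free can be released:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Returns true when every index is free again: release the page. */
  static bool return_idx(uint64_t *bitmap, int idx, bool *was_full)
  {
          *was_full = (*bitmap == 0);     /* nothing was free before this */
          *bitmap |= 1ULL << idx;
          return *bitmap == ~0ULL;
  }

  int main(void)
  {
          uint64_t bitmap = ~0ULL & ~1ULL;        /* only index 0 in use */
          bool was_full;

          printf("release page: %d\n", return_idx(&bitmap, 0, &was_full));
          return 0;       /* prints: release page: 1 */
  }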

Signed-off-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/alloc.c   | 185 ++++++++++++++----
 include/linux/mlx5/driver.h                   |   2 +
 2 files changed, 154 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 918cf027bcbc..5cced45caf36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -97,10 +97,44 @@ static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
 	return cpu_handle;
 }
 
-/* Implemented later in the series */
+static struct mlx5_dma_pool_page *
+mlx5_dma_pool_page_alloc(struct mlx5_dma_pool *pool)
+{
+	int blocks_per_page = BIT(PAGE_SHIFT - pool->block_shift);
+	struct mlx5_dma_pool_page *page;
+
+	page = kzalloc_obj(*page);
+	if (!page)
+		goto err_out;
+
+	page->pool = pool;
+	page->bitmap = bitmap_zalloc(blocks_per_page, GFP_KERNEL);
+	if (!page->bitmap)
+		goto err_free_page;
+
+	bitmap_fill(page->bitmap, blocks_per_page);
+	page->buf = mlx5_dma_zalloc_coherent_node(pool->dev, PAGE_SIZE,
+						  &page->dma, pool->node);
+	if (!page->buf)
+		goto err_free_bitmap;
+
+	return page;
+
+err_free_bitmap:
+	bitmap_free(page->bitmap);
+err_free_page:
+	kfree(page);
+err_out:
+	return NULL;
+}
+
 static void mlx5_dma_pool_page_free(struct mlx5_core_dev *dev,
 				    struct mlx5_dma_pool_page *page)
 {
+	dma_free_coherent(mlx5_core_dma_dev(dev), PAGE_SIZE, page->buf,
+			  page->dma);
+	bitmap_free(page->bitmap);
+	kfree(page);
 }
 
 static void mlx5_dma_pool_destroy(struct mlx5_dma_pool *pool)
@@ -142,6 +176,83 @@ static struct mlx5_dma_pool *mlx5_dma_pool_create(struct mlx5_core_dev *dev,
 	return pool;
 }
 
+static int mlx5_dma_pool_alloc_from_page(struct mlx5_dma_pool *pool,
+					 struct mlx5_dma_pool_page *page,
+					 unsigned long *idx_out)
+{
+	int blocks_per_page = BIT(PAGE_SHIFT - pool->block_shift);
+
+	*idx_out = find_first_bit(page->bitmap, blocks_per_page);
+	if (*idx_out >= blocks_per_page)
+		return -ENOMEM;
+
+	__clear_bit(*idx_out, page->bitmap);
+
+	if (bitmap_empty(page->bitmap, blocks_per_page))
+		list_move_tail(&page->pool_link, &pool->page_list);
+
+	return 0;
+}
+
+static struct mlx5_dma_pool_page *
+mlx5_dma_pool_alloc(struct mlx5_dma_pool *pool, unsigned long *idx_out)
+{
+	struct mlx5_dma_pool_page *page;
+
+	mutex_lock(&pool->lock);
+
+	page = list_first_entry_or_null(&pool->page_list,
+					struct mlx5_dma_pool_page, pool_link);
+	if (page && !mlx5_dma_pool_alloc_from_page(pool, page, idx_out))
+		goto unlock; /* successfully allocated from existing page */
+
+	page = mlx5_dma_pool_page_alloc(pool);
+	if (!page)
+		goto unlock;
+
+	list_add(&page->pool_link, &pool->page_list);
+	mlx5_dma_pool_alloc_from_page(pool, page, idx_out);
+
+unlock:
+	mutex_unlock(&pool->lock);
+	return page;
+}
+
+static void mlx5_dma_pool_free(struct mlx5_dma_pool *pool,
+			       struct mlx5_dma_pool_page *page,
+			       unsigned long idx)
+{
+	int blocks_per_page = BIT(PAGE_SHIFT - pool->block_shift);
+	bool was_full;
+
+	if (WARN_ONCE(idx >= blocks_per_page,
+		      "mlx5 dma pool invalid idx: %lu (max %d)\n",
+		      idx, blocks_per_page - 1))
+		return;
+
+	mutex_lock(&pool->lock);
+	if (WARN_ONCE(test_bit(idx, page->bitmap),
+		      "mlx5 dma pool double free: idx=%lu block_shift=%u\n",
+		      idx, pool->block_shift))
+		goto unlock;
+
+	was_full = bitmap_empty(page->bitmap, blocks_per_page);
+	__set_bit(idx, page->bitmap);
+
+	if (bitmap_full(page->bitmap, blocks_per_page)) {
+		list_del(&page->pool_link);
+		mlx5_dma_pool_page_free(pool->dev, page);
+	} else {
+		memset((u8 *)page->buf + (idx << pool->block_shift), 0,
+		       BIT(pool->block_shift));
+		if (was_full)
+			list_move(&page->pool_link, &pool->page_list);
+	}
+
+unlock:
+	mutex_unlock(&pool->lock);
+}
+
 static void
 mlx5_frag_buf_node_pools_destroy(struct mlx5_frag_buf_node_pools *node_pools)
 {
@@ -219,56 +330,64 @@ int mlx5_frag_buf_pools_init(struct mlx5_core_dev *dev)
 int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			     struct mlx5_frag_buf *buf, int node)
 {
-	int i;
+	struct mlx5_dma_pool *pool;
+	int pool_idx;
+
+	if (WARN_ONCE(size <= 0, "mlx5_frag_buf non-positive size: %d\n", size))
+		return -EINVAL;
+
+	node = node == NUMA_NO_NODE ? first_online_node : node;
+
+	if (WARN_ONCE(node < 0 || node >= nr_node_ids || !node_possible(node),
+		      "mlx5_frag_buf invalid node ID: %d\n", node))
+		return -EINVAL;
 
 	buf->size = size;
 	buf->npages = DIV_ROUND_UP(size, PAGE_SIZE);
-	buf->page_shift = PAGE_SHIFT;
-	buf->frags = kzalloc_objs(struct mlx5_buf_list, buf->npages);
+	buf->page_shift = clamp_t(int, order_base_2(size),
+				  MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT,
+				  PAGE_SHIFT);
+	buf->frags = kcalloc_node(buf->npages, sizeof(*buf->frags),
+				  GFP_KERNEL, node);
 	if (!buf->frags)
-		goto err_out;
+		return -ENOMEM;
 
-	for (i = 0; i < buf->npages; i++) {
+	pool_idx = buf->page_shift - MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT;
+	pool = dev->priv.frag_buf_node_pools[node]->pools[pool_idx];
+	for (int i = 0; i < buf->npages; i++) {
 		struct mlx5_buf_list *frag = &buf->frags[i];
-		int frag_sz = min_t(int, size, PAGE_SIZE);
+		struct mlx5_dma_pool_page *page;
+		unsigned long idx;
 
-		frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz,
-							  &frag->map, node);
-		if (!frag->buf)
-			goto err_free_buf;
-		if (frag->map & ((1 << buf->page_shift) - 1)) {
-			dma_free_coherent(mlx5_core_dma_dev(dev), frag_sz,
-					  buf->frags[i].buf, buf->frags[i].map);
-			mlx5_core_warn(dev, "unexpected map alignment: %pad, page_shift=%d\n",
-				       &frag->map, buf->page_shift);
-			goto err_free_buf;
+		page = mlx5_dma_pool_alloc(pool, &idx);
+		if (!page) {
+			mlx5_frag_buf_free(dev, buf);
+			return -ENOMEM;
 		}
-		size -= frag_sz;
+		frag->buf = (u8 *)page->buf + (idx << pool->block_shift);
+		frag->map = page->dma + (idx << pool->block_shift);
+		frag->frag_page = page;
 	}
 
 	return 0;
-
-err_free_buf:
-	while (i--)
-		dma_free_coherent(mlx5_core_dma_dev(dev), PAGE_SIZE, buf->frags[i].buf,
-				  buf->frags[i].map);
-	kfree(buf->frags);
-err_out:
-	return -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(mlx5_frag_buf_alloc_node);
 
 void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf)
 {
-	int size = buf->size;
-	int i;
+	for (int i = 0; i < buf->npages; i++) {
+		struct mlx5_buf_list *frag = &buf->frags[i];
+		struct mlx5_dma_pool_page *page;
+		struct mlx5_dma_pool *pool;
+		unsigned long idx;
 
-	for (i = 0; i < buf->npages; i++) {
-		int frag_sz = min_t(int, size, PAGE_SIZE);
+		if (!frag->buf)
+			continue;
 
-		dma_free_coherent(mlx5_core_dma_dev(dev), frag_sz, buf->frags[i].buf,
-				  buf->frags[i].map);
-		size -= frag_sz;
+		page = frag->frag_page;
+		pool = page->pool;
+		idx = (frag->map - page->dma) >> pool->block_shift;
+		mlx5_dma_pool_free(pool, page, idx);
 	}
 	kfree(buf->frags);
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 71f7615ab553..531ce66fc8ef 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -343,9 +343,11 @@ struct mlx5_cmd_mailbox {
 	struct mlx5_cmd_mailbox *next;
 };
 
+struct mlx5_dma_pool_page;
 struct mlx5_buf_list {
 	void		       *buf;
 	dma_addr_t		map;
+	struct mlx5_dma_pool_page *frag_page;
 };
 
 struct mlx5_frag_buf {
-- 
2.44.0


Thread overview: 5+ messages
2026-04-28  5:29 [PATCH net-next 0/3] net/mlx5: enable sub-page allocations for mlx5_frag_buf Tariq Toukan
2026-04-28  5:29 ` [PATCH net-next 1/3] net/mlx5: wire frag buf pools lifecycle hooks Tariq Toukan
2026-04-28  5:29 ` [PATCH net-next 2/3] net/mlx5: add frag buf pools create/destroy paths Tariq Toukan
2026-04-28  5:29 ` Tariq Toukan [this message]
2026-04-28 14:47   ` [PATCH net-next 3/3] net/mlx5: use internal dma pools for frag buf alloc Leon Romanovsky
