From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
<linux-kernel@vger.kernel.org>, Gal Pressman <gal@nvidia.com>,
Dragos Tatulea <dtatulea@nvidia.com>,
Moshe Shemesh <moshe@nvidia.com>, Nimrod Oren <noren@nvidia.com>
Subject: [PATCH net-next 2/3] net/mlx5: add frag buf pools create/destroy paths
Date: Tue, 28 Apr 2026 08:29:19 +0300 [thread overview]
Message-ID: <20260428052920.219201-3-tariqt@nvidia.com> (raw)
In-Reply-To: <20260428052920.219201-1-tariqt@nvidia.com>
From: Nimrod Oren <noren@nvidia.com>
Introduce mlx5 DMA pool and pool-page data structures, and add their
creation and teardown paths.
Each NUMA node owns a set of mlx5_dma_pool instances, each one with a
different block size. The block sizes are all powers of two, with
shifts ranging from MLX5_ADAPTER_PAGE_SHIFT up to PAGE_SHIFT. Since
mlx5_frag_bufs are used to back objects whose sizes are encoded relative
to MLX5_ADAPTER_PAGE_SHIFT, a smaller block_shift value cannot be used.
Requests larger than PAGE_SIZE continue to be handled as page-sized
fragments, as in the existing frag-buf allocation model.
Signed-off-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/alloc.c | 136 +++++++++++++++++-
include/linux/mlx5/driver.h | 7 +-
2 files changed, 140 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index cebb3559d2c9..918cf027bcbc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -37,10 +37,15 @@
#include <linux/bitmap.h>
#include <linux/dma-mapping.h>
#include <linux/vmalloc.h>
+#include <linux/nodemask.h>
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
+#define MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT MLX5_ADAPTER_PAGE_SHIFT
+#define MLX5_FRAG_BUF_POOLS_NUM \
+ (PAGE_SHIFT - MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT + 1)
+
struct mlx5_db_pgdir {
struct list_head list;
unsigned long *bitmap;
@@ -48,6 +53,27 @@ struct mlx5_db_pgdir {
dma_addr_t db_dma;
};
+struct mlx5_dma_pool {
+ /* Protects page_list and per-page allocation bitmaps. */
+ struct mutex lock;
+ struct list_head page_list;
+ struct mlx5_core_dev *dev;
+ int node;
+ u8 block_shift;
+};
+
+struct mlx5_dma_pool_page {
+ struct mlx5_dma_pool *pool;
+ struct list_head pool_link;
+ unsigned long *bitmap;
+ void *buf;
+ dma_addr_t dma;
+};
+
+struct mlx5_frag_buf_node_pools {
+ struct mlx5_dma_pool *pools[MLX5_FRAG_BUF_POOLS_NUM];
+};
+
/* Handling for queue buffers -- we allocate a bunch of memory and
* register it in a memory region at HCA virtual address 0.
*/
@@ -72,13 +98,121 @@ static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
}
/* Implemented later in the series */
+static void mlx5_dma_pool_page_free(struct mlx5_core_dev *dev,
+ struct mlx5_dma_pool_page *page)
+{
+}
+
+static void mlx5_dma_pool_destroy(struct mlx5_dma_pool *pool)
+{
+ struct list_head *page_list = &pool->page_list;
+ struct mlx5_dma_pool_page *page, *tmp;
+
+ if (WARN(!list_empty(page_list),
+ "mlx5 dma pool destroy with non-empty pool: block_shift=%u\n",
+ pool->block_shift))
+ list_for_each_entry_safe(page, tmp, page_list, pool_link) {
+ list_del(&page->pool_link);
+ mlx5_dma_pool_page_free(pool->dev, page);
+ }
+
+ mutex_destroy(&pool->lock);
+ kfree(pool);
+}
+
+static struct mlx5_dma_pool *mlx5_dma_pool_create(struct mlx5_core_dev *dev,
+ int node, u8 block_shift)
+{
+ struct mlx5_dma_pool *pool;
+
+ if (WARN_ONCE(block_shift > PAGE_SHIFT,
+ "mlx5 dma pool invalid block_shift: %u (max %d)\n",
+ block_shift, PAGE_SHIFT))
+ return NULL;
+
+ pool = kzalloc_obj(*pool);
+ if (!pool)
+ return NULL;
+
+ INIT_LIST_HEAD(&pool->page_list);
+ mutex_init(&pool->lock);
+ pool->dev = dev;
+ pool->node = node;
+ pool->block_shift = block_shift;
+ return pool;
+}
+
+static void
+mlx5_frag_buf_node_pools_destroy(struct mlx5_frag_buf_node_pools *node_pools)
+{
+ for (int i = 0; i < MLX5_FRAG_BUF_POOLS_NUM; i++)
+ if (node_pools->pools[i])
+ mlx5_dma_pool_destroy(node_pools->pools[i]);
+ kfree(node_pools);
+}
+
+static struct mlx5_frag_buf_node_pools *
+mlx5_frag_buf_node_pools_create(struct mlx5_core_dev *dev, int node)
+{
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = kzalloc_obj(*node_pools);
+ if (!node_pools)
+ return NULL;
+
+ for (int i = 0; i < MLX5_FRAG_BUF_POOLS_NUM; i++) {
+ u8 block_shift = MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT + i;
+
+ node_pools->pools[i] = mlx5_dma_pool_create(dev, node,
+ block_shift);
+ if (!node_pools->pools[i]) {
+ mlx5_frag_buf_node_pools_destroy(node_pools);
+ return NULL;
+ }
+ }
+
+ return node_pools;
+}
+
void mlx5_frag_buf_pools_cleanup(struct mlx5_core_dev *dev)
{
+ struct mlx5_priv *priv = &dev->priv;
+ int node;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = priv->frag_buf_node_pools[node];
+ if (!node_pools)
+ continue;
+ mlx5_frag_buf_node_pools_destroy(node_pools);
+ }
+
+ kfree(priv->frag_buf_node_pools);
+ priv->frag_buf_node_pools = NULL;
}
-/* Implemented later in the series */
int mlx5_frag_buf_pools_init(struct mlx5_core_dev *dev)
{
+ struct mlx5_priv *priv = &dev->priv;
+ int node;
+
+ priv->frag_buf_node_pools = kzalloc_objs(*priv->frag_buf_node_pools,
+ nr_node_ids);
+ if (!priv->frag_buf_node_pools)
+ return -ENOMEM;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = mlx5_frag_buf_node_pools_create(dev, node);
+ if (!node_pools) {
+ mlx5_frag_buf_pools_cleanup(dev);
+ return -ENOMEM;
+ }
+ priv->frag_buf_node_pools[node] = node_pools;
+ }
+
return 0;
}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..71f7615ab553 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -558,6 +558,7 @@ enum mlx5_func_type {
MLX5_FUNC_TYPE_NUM,
};
+struct mlx5_frag_buf_node_pools;
struct mlx5_ft_pool;
struct mlx5_priv {
/* IRQ table valid only for real pci devices PF or VF */
@@ -581,14 +582,16 @@ struct mlx5_priv {
struct mlx5_debugfs_entries dbg;
- /* start: alloc staff */
+ /* start: alloc stuff */
/* protect buffer allocation according to numa node */
struct mutex alloc_mutex;
int numa_node;
struct mutex pgdir_mutex;
struct list_head pgdir_list;
- /* end: alloc staff */
+
+ struct mlx5_frag_buf_node_pools **frag_buf_node_pools;
+ /* end: alloc stuff */
struct mlx5_adev **adev;
int adev_idx;
--
2.44.0
next prev parent reply other threads:[~2026-04-28 5:30 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-28 5:29 [PATCH net-next 0/3] net/mlx5: enable sub-page allocations for mlx5_frag_buf Tariq Toukan
2026-04-28 5:29 ` [PATCH net-next 1/3] net/mlx5: wire frag buf pools lifecycle hooks Tariq Toukan
2026-04-28 5:29 ` Tariq Toukan [this message]
2026-04-28 5:29 ` [PATCH net-next 3/3] net/mlx5: use internal dma pools for frag buf alloc Tariq Toukan
2026-04-28 14:47 ` Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260428052920.219201-3-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=dtatulea@nvidia.com \
--cc=edumazet@google.com \
--cc=gal@nvidia.com \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=noren@nvidia.com \
--cc=pabeni@redhat.com \
--cc=saeedm@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox