From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
<linux-kernel@vger.kernel.org>, Gal Pressman <gal@nvidia.com>,
Dragos Tatulea <dtatulea@nvidia.com>,
Moshe Shemesh <moshe@nvidia.com>, Nimrod Oren <noren@nvidia.com>
Subject: [PATCH net-next V2 2/3] net/mlx5: add frag buf pools create/destroy paths
Date: Wed, 29 Apr 2026 23:14:28 +0300
Message-ID: <20260429201429.223809-3-tariqt@nvidia.com>
In-Reply-To: <20260429201429.223809-1-tariqt@nvidia.com>
From: Nimrod Oren <noren@nvidia.com>
Introduce mlx5 DMA pool and pool-page data structures, and add the
creation and teardown paths.
Each NUMA node owns a set of mlx5_dma_pool instances, one per block
size. The block sizes are the powers of two from
2^MLX5_ADAPTER_PAGE_SHIFT up to PAGE_SIZE, i.e. block shifts ranging
from MLX5_ADAPTER_PAGE_SHIFT to PAGE_SHIFT. Since mlx5_frag_bufs back
objects whose sizes are encoded relative to MLX5_ADAPTER_PAGE_SHIFT, a
smaller block_shift cannot be used.
Requests larger than PAGE_SIZE continue to be handled as page-sized
fragments, as in the existing frag-buf allocation model.
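For example, assuming MLX5_ADAPTER_PAGE_SHIFT is 12 (4 KB device
pages):
  - On a 4 KB-page kernel (PAGE_SHIFT = 12), each node gets a single
    pool with a 4 KB block size (MLX5_FRAG_BUF_POOLS_NUM = 1).
  - On a 64 KB-page kernel (PAGE_SHIFT = 16), each node gets five
    pools with block sizes 4 KB, 8 KB, 16 KB, 32 KB and 64 KB
    (MLX5_FRAG_BUF_POOLS_NUM = 16 - 12 + 1 = 5).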
Signed-off-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/alloc.c | 116 +++++++++++++++++-
include/linux/mlx5/driver.h | 7 +-
2 files changed, 119 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index cebb3559d2c9..fcc859c5f810 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -37,10 +37,15 @@
#include <linux/bitmap.h>
#include <linux/dma-mapping.h>
#include <linux/vmalloc.h>
+#include <linux/nodemask.h>
#include <linux/mlx5/driver.h>
#include "mlx5_core.h"
+#define MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT MLX5_ADAPTER_PAGE_SHIFT
+#define MLX5_FRAG_BUF_POOLS_NUM \
+ (PAGE_SHIFT - MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT + 1)
+
struct mlx5_db_pgdir {
struct list_head list;
unsigned long *bitmap;
@@ -48,6 +53,27 @@ struct mlx5_db_pgdir {
dma_addr_t db_dma;
};
+struct mlx5_dma_pool {
+ /* Protects page_list and per-page allocation bitmaps. */
+ struct mutex lock;
+ struct list_head page_list;
+ struct mlx5_core_dev *dev;
+ int node;
+ u8 block_shift;
+};
+
+struct mlx5_dma_pool_page {
+ struct mlx5_dma_pool *pool;
+ struct list_head pool_link;
+ unsigned long *bitmap;
+ void *buf;
+ dma_addr_t dma;
+};
+
+struct mlx5_frag_buf_node_pools {
+ struct mlx5_dma_pool *pools[MLX5_FRAG_BUF_POOLS_NUM];
+};
+
/* Handling for queue buffers -- we allocate a bunch of memory and
* register it in a memory region at HCA virtual address 0.
*/
@@ -71,14 +97,100 @@ static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev,
return cpu_handle;
}
-/* Implemented later in the series */
+static void mlx5_dma_pool_destroy(struct mlx5_dma_pool *pool)
+{
+ mutex_destroy(&pool->lock);
+ kfree(pool);
+}
+
+static struct mlx5_dma_pool *mlx5_dma_pool_create(struct mlx5_core_dev *dev,
+ int node, u8 block_shift)
+{
+ struct mlx5_dma_pool *pool;
+
+ pool = kzalloc_obj(*pool);
+ if (!pool)
+ return NULL;
+
+ INIT_LIST_HEAD(&pool->page_list);
+ mutex_init(&pool->lock);
+ pool->dev = dev;
+ pool->node = node;
+ pool->block_shift = block_shift;
+ return pool;
+}
+
+static void
+mlx5_frag_buf_node_pools_destroy(struct mlx5_frag_buf_node_pools *node_pools)
+{
+ for (int i = 0; i < MLX5_FRAG_BUF_POOLS_NUM; i++)
+ if (node_pools->pools[i])
+ mlx5_dma_pool_destroy(node_pools->pools[i]);
+ kfree(node_pools);
+}
+
+static struct mlx5_frag_buf_node_pools *
+mlx5_frag_buf_node_pools_create(struct mlx5_core_dev *dev, int node)
+{
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = kzalloc_obj(*node_pools);
+ if (!node_pools)
+ return NULL;
+
+ for (int i = 0; i < MLX5_FRAG_BUF_POOLS_NUM; i++) {
+ u8 block_shift = MLX5_FRAG_BUF_POOL_MIN_BLOCK_SHIFT + i;
+
+ node_pools->pools[i] = mlx5_dma_pool_create(dev, node,
+ block_shift);
+ if (!node_pools->pools[i]) {
+ mlx5_frag_buf_node_pools_destroy(node_pools);
+ return NULL;
+ }
+ }
+
+ return node_pools;
+}
+
void mlx5_frag_buf_pools_cleanup(struct mlx5_core_dev *dev)
{
+ struct mlx5_priv *priv = &dev->priv;
+ int node;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = priv->frag_buf_node_pools[node];
+ if (!node_pools)
+ continue;
+ mlx5_frag_buf_node_pools_destroy(node_pools);
+ }
+
+ kfree(priv->frag_buf_node_pools);
+ priv->frag_buf_node_pools = NULL;
}
-/* Implemented later in the series */
int mlx5_frag_buf_pools_init(struct mlx5_core_dev *dev)
{
+ struct mlx5_priv *priv = &dev->priv;
+ int node;
+
+ priv->frag_buf_node_pools = kzalloc_objs(*priv->frag_buf_node_pools,
+ nr_node_ids);
+ if (!priv->frag_buf_node_pools)
+ return -ENOMEM;
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct mlx5_frag_buf_node_pools *node_pools;
+
+ node_pools = mlx5_frag_buf_node_pools_create(dev, node);
+ if (!node_pools) {
+ mlx5_frag_buf_pools_cleanup(dev);
+ return -ENOMEM;
+ }
+ priv->frag_buf_node_pools[node] = node_pools;
+ }
+
return 0;
}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 04b96c5abb57..71f7615ab553 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -558,6 +558,7 @@ enum mlx5_func_type {
MLX5_FUNC_TYPE_NUM,
};
+struct mlx5_frag_buf_node_pools;
struct mlx5_ft_pool;
struct mlx5_priv {
/* IRQ table valid only for real pci devices PF or VF */
@@ -581,14 +582,16 @@ struct mlx5_priv {
struct mlx5_debugfs_entries dbg;
- /* start: alloc staff */
+ /* start: alloc stuff */
/* protect buffer allocation according to numa node */
struct mutex alloc_mutex;
int numa_node;
struct mutex pgdir_mutex;
struct list_head pgdir_list;
- /* end: alloc staff */
+
+ struct mlx5_frag_buf_node_pools **frag_buf_node_pools;
+ /* end: alloc stuff */
struct mlx5_adev **adev;
int adev_idx;
--
2.44.0
Thread overview: 5+ messages
2026-04-29 20:14 [PATCH net-next V2 0/3] net/mlx5: enable sub-page allocations for mlx5_frag_buf Tariq Toukan
2026-04-29 20:14 ` [PATCH net-next V2 1/3] net/mlx5: wire frag buf pools lifecycle hooks Tariq Toukan
2026-04-29 20:14 ` Tariq Toukan [this message]
2026-04-29 20:14 ` [PATCH net-next V2 3/3] net/mlx5: use internal dma pools for frag buf alloc Tariq Toukan
2026-05-02 2:20 ` [PATCH net-next V2 0/3] net/mlx5: enable sub-page allocations for mlx5_frag_buf patchwork-bot+netdevbpf