From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
"Alexei Starovoitov" <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
"Jesper Dangaard Brouer" <hawk@kernel.org>,
John Fastabend <john.fastabend@gmail.com>,
<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
<linux-kernel@vger.kernel.org>, <bpf@vger.kernel.org>,
Gal Pressman <gal@nvidia.com>, Moshe Shemesh <moshe@nvidia.com>,
Dragos Tatulea <dtatulea@nvidia.com>,
Carolina Jubran <cjubran@nvidia.com>
Subject: [PATCH net-next 5/5] net/mlx5e: XDP, Use page fragments for linear data in multibuf-mode
Date: Thu, 19 Mar 2026 09:50:36 +0200
Message-ID: <20260319075036.24734-6-tariqt@nvidia.com>
In-Reply-To: <20260319075036.24734-1-tariqt@nvidia.com>

From: Dragos Tatulea <dtatulea@nvidia.com>

Currently, in XDP multi-buffer mode for striding RQ, a whole page is
allocated for the linear part of the XDP buffer. This is wasteful,
especially on systems with larger page sizes.

This change splits the page into fixed-size fragments. The page is
replenished only once the maximum number of allowed fragments has been
consumed. When a fragment goes unused, it is simply recycled for the
next packet.

This is especially beneficial for XDP_DROP, as the dropped packet's
fragment is recycled for the next packet. In the most extreme case
(XDP_DROP for all traffic), 0 fragments are consumed, so a single
linear page allocation suffices for the lifetime of the XDP program.
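
To make the accounting concrete, here is a tiny user-space model of the
scheme (illustrative only: the fragment and page sizes are hypothetical
stand-ins, refill()/get_frag() merely mirror the real helpers
mlx5e_mpwqe_linear_page_refill()/mlx5e_mpwqe_get_linear_page_frag()
added to en_rx.c below, and malloc/free stand in for the page_pool
netmem calls):

	#include <stdio.h>
	#include <stdlib.h>

	#define LOG_FRAG_SZ	 9	/* hypothetical: 512 B fragments */
	#define MODEL_PAGE_SHIFT 12	/* hypothetical: 4K pages */
	#define MAX_FRAGS	 (1u << (MODEL_PAGE_SHIFT - LOG_FRAG_SZ))

	struct linear_info {
		unsigned char *page;	/* stands in for frag_page.netmem */
		unsigned int frags;	/* fragments handed out so far */
		unsigned int max_frags;
	};

	static int refill(struct linear_info *li)
	{
		/* Only replace the page once every fragment was consumed. */
		if (li->frags < li->max_frags)
			return 0;
		free(li->page);		/* page_pool release in the driver */
		li->page = malloc(1u << MODEL_PAGE_SHIFT);
		li->frags = 0;		/* fresh page, counter restarts */
		return li->page ? 0 : -1;
	}

	static void *get_frag(struct linear_info *li)
	{
		if (refill(li))
			return NULL;
		/* Next free fixed-size slice within the page. */
		return li->page + ((size_t)li->frags << LOG_FRAG_SZ);
	}

	int main(void)
	{
		/* frags == max_frags forces an allocation on first use,
		 * like li->frag_page.frags = li->max_frags in the patch.
		 */
		struct linear_info li = { NULL, MAX_FRAGS, MAX_FRAGS };
		unsigned int pkt;

		for (pkt = 0; pkt < 20; pkt++) {
			void *va = get_frag(&li);

			if (!va)
				return 1;
			printf("pkt %2u uses frag %u of page %p\n",
			       pkt, li.frags, (void *)li.page);
			if (pkt % 2)	/* pretend odd packets are kept... */
				li.frags++;
			/* ...and even ones are XDP_DROP: frags stays put,
			 * so the same slice is reused for the next packet.
			 */
		}
		free(li.page);
		return 0;
	}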

The previous page_pool size increase (doubling the size) was overly
conservative, and there are now far fewer allocations (1/8 as many for
a 4K page). So drop the page_pool size extension altogether when the
linear-part page is used.
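
As a worked example (the concrete fragment size is illustrative; the
real value follows from MLX5E_XDP_LOG_MAX_LINEAR_SZ and depends on
SKB_DATA_ALIGN and struct layout): with 512 B fragments on a 4K page,
linear_frag_count = BIT(12 - 9) = 8, i.e. one page covers the linear
parts of 8 packets, matching the 1/8 figure above. On a 64K page the
same fragment size would give BIT(16 - 9) = 128 packets per page.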

The improvement is most visible in XDP_DROP tests with small 64 B
packets and an MTU large enough for striding RQ to work in non-linear
mode:

+----------------------+------+------------+-------------+-------------+
| System               | MTU  | baseline   | this change | improvement |
|----------------------+------+------------+-------------+-------------|
| 4K page x86_64 [1]   | 9000 | 26.30 Mpps | 30.45 Mpps  | 15.80 %     |
| 64K page aarch64 [2] | 9000 | 15.27 Mpps | 20.10 Mpps  | 31.62 %     |
+----------------------+------+------------+-------------+-------------+

[1] Intel Xeon Platinum 8580
[2] ARM Neoverse-N1

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  6 +++
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 25 ++++++---
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 54 +++++++++++++++----
 3 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 592234780f2b..2270e2e550dd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -82,6 +82,9 @@ struct page_pool;
#define MLX5E_PAGECNT_BIAS_MAX U16_MAX
#define MLX5E_RX_MAX_HEAD (256)
+#define MLX5E_XDP_LOG_MAX_LINEAR_SZ \
+ order_base_2(MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM + MLX5E_RX_MAX_HEAD))
+
#define MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE (8)
#define MLX5E_SHAMPO_WQ_HEADER_PER_PAGE \
(PAGE_SIZE >> MLX5E_SHAMPO_LOG_HEADER_ENTRY_SIZE)
@@ -596,6 +599,7 @@ struct mlx5e_mpw_info {
struct mlx5e_mpw_linear_info {
struct mlx5e_frag_page frag_page;
+ u16 max_frags;
};
#define MLX5E_MAX_RX_FRAGS 4
@@ -1081,6 +1085,8 @@ bool mlx5e_reset_rx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
bool mlx5e_reset_rx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
bool dim_enabled, bool keep_dim_state);
+void mlx5e_mpwqe_dealloc_linear_page(struct mlx5e_rq *rq);
+
struct mlx5e_sq_param;
int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8b3c82f6f038..b376abc561fd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -371,11 +371,11 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node)
static int mlx5e_rq_alloc_mpwqe_linear_info(struct mlx5e_rq *rq, int node,
struct mlx5e_params *params,
- struct mlx5e_rq_opt_param *rqo,
- u32 *pool_size)
+ struct mlx5e_rq_opt_param *rqo)
{
struct mlx5_core_dev *mdev = rq->mdev;
struct mlx5e_mpw_linear_info *li;
+ u32 linear_frag_count;
if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, rqo) ||
!params->xdp_prog)
@@ -385,10 +385,22 @@ static int mlx5e_rq_alloc_mpwqe_linear_info(struct mlx5e_rq *rq, int node,
if (!li)
return -ENOMEM;
+ linear_frag_count =
+ BIT(rq->mpwqe.page_shift - MLX5E_XDP_LOG_MAX_LINEAR_SZ);
+ if (linear_frag_count > U16_MAX) {
+ netdev_warn(rq->netdev,
+ "rq %d: linear_frag_count (%u) larger than expected (%u), page_shift: %u, log_max_linear_sz: %u\n",
+ rq->ix, linear_frag_count, U16_MAX,
+ rq->mpwqe.page_shift, MLX5E_XDP_LOG_MAX_LINEAR_SZ);
+ kvfree(li);
+ return -EINVAL;
+ }
+
+ li->max_frags = linear_frag_count;
rq->mpwqe.linear_info = li;
- /* additional page per packet for the linear part */
- *pool_size *= 2;
+ /* Set to max to force allocation on first run. */
+ li->frag_page.frags = li->max_frags;
return 0;
}
@@ -955,8 +967,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
if (err)
goto err_rq_mkey;
- err = mlx5e_rq_alloc_mpwqe_linear_info(rq, node, params, rqo,
- &pool_size);
+ err = mlx5e_rq_alloc_mpwqe_linear_info(rq, node, params, rqo);
if (err)
goto err_free_mpwqe_info;
@@ -1347,6 +1358,8 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
mlx5_wq_ll_pop(wq, wqe_ix_be,
&wqe->next.next_wqe_index);
}
+
+ mlx5e_mpwqe_dealloc_linear_page(rq);
} else {
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
u16 missing = mlx5_wq_cyc_missing(wq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index feb042d84b8e..2ac38536afe9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -300,6 +300,35 @@ static void mlx5e_page_release_fragmented(struct page_pool *pp,
page_pool_put_unrefed_netmem(pp, netmem, -1, true);
}
+static int mlx5e_mpwqe_linear_page_refill(struct mlx5e_rq *rq)
+{
+ struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
+
+ if (likely(li->frag_page.frags < li->max_frags))
+ return 0;
+
+ if (likely(li->frag_page.netmem)) {
+ mlx5e_page_release_fragmented(rq->page_pool, &li->frag_page);
+ li->frag_page.netmem = 0;
+ }
+
+ return mlx5e_page_alloc_fragmented(rq->page_pool, &li->frag_page);
+}
+
+static void *mlx5e_mpwqe_get_linear_page_frag(struct mlx5e_rq *rq)
+{
+ struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
+ u32 frag_offset;
+
+ if (unlikely(mlx5e_mpwqe_linear_page_refill(rq)))
+ return NULL;
+
+ frag_offset = li->frag_page.frags << MLX5E_XDP_LOG_MAX_LINEAR_SZ;
+ WARN_ON(frag_offset >= BIT(rq->mpwqe.page_shift));
+
+ return netmem_address(li->frag_page.netmem) + frag_offset;
+}
+
static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *frag)
{
@@ -702,6 +731,16 @@ static void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
bitmap_fill(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe);
}
+void mlx5e_mpwqe_dealloc_linear_page(struct mlx5e_rq *rq)
+{
+ struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
+
+ if (!li || !li->frag_page.netmem)
+ return;
+
+ mlx5e_page_release_fragmented(rq->page_pool, &li->frag_page);
+}
+
INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
{
struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1899,18 +1938,17 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
/* area for bpf_xdp_[store|load]_bytes */
net_prefetchw(netmem_address(frag_page->netmem) + frag_offset);
- linear_page = &rq->mpwqe.linear_info->frag_page;
- if (unlikely(mlx5e_page_alloc_fragmented(rq->page_pool,
- linear_page))) {
+ va = mlx5e_mpwqe_get_linear_page_frag(rq);
+ if (!va) {
rq->stats->buff_alloc_err++;
return NULL;
}
- va = netmem_address(linear_page->netmem);
net_prefetchw(va); /* xdp_frame data area */
linear_hr = XDP_PACKET_HEADROOM;
linear_data_len = 0;
linear_frame_sz = MLX5_SKB_FRAG_SZ(linear_hr + MLX5E_RX_MAX_HEAD);
+ linear_page = &rq->mpwqe.linear_info->frag_page;
} else {
skb = napi_alloc_skb(rq->cq.napi,
ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
@@ -1971,8 +2009,6 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
linear_page->frags++;
}
- mlx5e_page_release_fragmented(rq->page_pool,
- linear_page);
return NULL; /* page/packet was consumed by XDP */
}
@@ -1989,15 +2025,11 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
rq, mxbuf->xdp.data_hard_start, linear_frame_sz,
mxbuf->xdp.data - mxbuf->xdp.data_hard_start, len,
mxbuf->xdp.data - mxbuf->xdp.data_meta);
- if (unlikely(!skb)) {
- mlx5e_page_release_fragmented(rq->page_pool,
- linear_page);
+ if (unlikely(!skb))
return NULL;
- }
skb_mark_for_recycle(skb);
linear_page->frags++;
- mlx5e_page_release_fragmented(rq->page_pool, linear_page);
if (xdp_buff_has_frags(&mxbuf->xdp)) {
struct mlx5e_frag_page *pagep;
--
2.44.0

Thread overview: 8+ messages
2026-03-19 7:50 [PATCH net-next 0/5] net/mlx5e: XDP, Add support for multi-packet per page Tariq Toukan
2026-03-19 7:50 ` [PATCH net-next 1/5] net/mlx5e: XSK, Increase size for chunk_size param Tariq Toukan
2026-03-19 7:50 ` [PATCH net-next 2/5] net/mlx5e: XDP, Improve dma address calculation of linear part for XDP_TX Tariq Toukan
2026-03-19 7:50 ` [PATCH net-next 3/5] net/mlx5e: XDP, Remove stride size limitation Tariq Toukan
2026-03-19 7:50 ` [PATCH net-next 4/5] net/mlx5e: XDP, Use a single linear page per rq Tariq Toukan
2026-03-19 7:50 ` Tariq Toukan [this message]
2026-03-24 2:42 ` [PATCH net-next 5/5] net/mlx5e: XDP, Use page fragments for linear data in multibuf-mode Jakub Kicinski
2026-03-24 8:50 ` Dragos Tatulea