All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jakub Kicinski <kuba@kernel.org>
To: tariqt@nvidia.com
Cc: Jakub Kicinski <kuba@kernel.org>,
	leon@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, moshe@nvidia.com, daniel@iogearbox.net,
	edumazet@google.com, andrew+netdev@lunn.ch, hawk@kernel.org,
	dtatulea@nvidia.com, ast@kernel.org, mbloch@nvidia.com,
	john.fastabend@gmail.com, bpf@vger.kernel.org,
	cjubran@nvidia.com, linux-rdma@vger.kernel.org,
	davem@davemloft.net, pabeni@redhat.com, gal@nvidia.com,
	saeedm@nvidia.com
Subject: Re: [PATCH net-next 5/5] net/mlx5e: XDP, Use page fragments for linear data in multibuf-mode
Date: Mon, 23 Mar 2026 19:42:34 -0700	[thread overview]
Message-ID: <20260324024235.929875-1-kuba@kernel.org> (raw)
In-Reply-To: <20260319075036.24734-6-tariqt@nvidia.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.

For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
net/mlx5e: XDP, Use page fragments for linear data in multibuf-mode

This commit optimizes XDP multi-buffer mode by splitting pages into fixed-
size fragments instead of allocating whole pages for the linear part of XDP
buffers. This reduces allocations significantly, especially on systems with
larger page sizes, improving performance by 15.80% on 4K pages and 31.62%
on 64K pages for XDP_DROP tests.

> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index 592234780f2b..2270e2e550dd 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h

[ ... ]

> @@ -1081,6 +1085,8 @@ bool mlx5e_reset_rx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
>  bool mlx5e_reset_rx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
>  					bool dim_enabled, bool keep_dim_state);
>
> +void mlx5e_mpwqe_dealloc_linear_page(struct mlx5e_rq *rq);
> +
>  struct mlx5e_sq_param;
>  int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
>  		     struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool,

[ ... ]

> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 8b3c82f6f038..b376abc561fd 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c

[ ... ]

> @@ -385,10 +385,22 @@ static int mlx5e_rq_alloc_mpwqe_linear_info(struct mlx5e_rq *rq, int node,
>  	if (!li)
>  		return -ENOMEM;
>
> +	linear_frag_count =
> +		BIT(rq->mpwqe.page_shift - MLX5E_XDP_LOG_MAX_LINEAR_SZ);
> +	if (linear_frag_count > U16_MAX) {
> +		netdev_warn(rq->netdev,
> +			    "rq %d: linear_frag_count (%u) larger than expected (%u), page_shift: %u, log_max_linear_sz: %u\n",
> +			    rq->ix, linear_frag_count, U16_MAX,
> +			    rq->mpwqe.page_shift, MLX5E_XDP_LOG_MAX_LINEAR_SZ);
> +		kvfree(li);
> +		return -EINVAL;
> +	}
> +
> +	li->max_frags = linear_frag_count;
>  	rq->mpwqe.linear_info = li;
>
> -	/* additional page per packet for the linear part */
> -	*pool_size *= 2;
> +	/* Set to max to force allocation on first run. */
> +	li->frag_page.frags = li->max_frags;
>
>  	return 0;
>  }

[ ... ]

> @@ -1347,6 +1358,8 @@ void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
>  			mlx5_wq_ll_pop(wq, wqe_ix_be,
>  				       &wqe->next.next_wqe_index);
>  		}
> +
> +		mlx5e_mpwqe_dealloc_linear_page(rq);
>  	} else {
>  		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
>  		u16 missing = mlx5_wq_cyc_missing(wq);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index feb042d84b8e..2ac38536afe9 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

[ ... ]

> @@ -300,6 +300,35 @@ static void mlx5e_page_release_fragmented(struct page_pool *pp,
>  		page_pool_put_unrefed_netmem(pp, netmem, -1, true);
>  }
>
> +static int mlx5e_mpwqe_linear_page_refill(struct mlx5e_rq *rq)
> +{
> +	struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
> +
> +	if (likely(li->frag_page.frags < li->max_frags))
> +		return 0;
> +
> +	if (likely(li->frag_page.netmem)) {
> +		mlx5e_page_release_fragmented(rq->page_pool, &li->frag_page);
> +		li->frag_page.netmem = 0;
> +	}
> +
> +	return mlx5e_page_alloc_fragmented(rq->page_pool, &li->frag_page);
> +}
> +
> +static void *mlx5e_mpwqe_get_linear_page_frag(struct mlx5e_rq *rq)
> +{
> +	struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
> +	u32 frag_offset;
> +
> +	if (unlikely(mlx5e_mpwqe_linear_page_refill(rq)))
> +		return NULL;
> +
> +	frag_offset = li->frag_page.frags << MLX5E_XDP_LOG_MAX_LINEAR_SZ;
> +	WARN_ON(frag_offset >= BIT(rq->mpwqe.page_shift));
> +
> +	return netmem_address(li->frag_page.netmem) + frag_offset;
> +}
> +

[ ... ]

> @@ -702,6 +731,16 @@ static void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
>  	bitmap_fill(wi->skip_release_bitmap, rq->mpwqe.pages_per_wqe);
>  }
>
> +void mlx5e_mpwqe_dealloc_linear_page(struct mlx5e_rq *rq)
> +{
> +	struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
> +
> +	if (!li || !li->frag_page.netmem)
> +		return;
> +
> +	mlx5e_page_release_fragmented(rq->page_pool, &li->frag_page);
> +}

Does this function need to reset li->frag_page.netmem to 0 and
li->frag_page.frags to li->max_frags after releasing the page?

When an RQ is flushed and then reactivated (for example, in error recovery
via mlx5e_rx_reporter_err_rq_cqe_recover()), the following sequence occurs:

mlx5e_rx_reporter_err_rq_cqe_recover()
  mlx5e_flush_rq()
    mlx5e_free_rx_descs()
      mlx5e_mpwqe_dealloc_linear_page()
        mlx5e_page_release_fragmented()  [releases the page]
        [but doesn't reset netmem or frags]
  mlx5e_activate_rq()
    [RQ becomes active again]

When the next XDP multi-buffer packet arrives, mlx5e_mpwqe_get_linear_page_frag()
is called:

mlx5e_skb_from_cqe_mpwrq_nonlinear()
  mlx5e_mpwqe_get_linear_page_frag()
    mlx5e_mpwqe_linear_page_refill()
      if (likely(li->frag_page.frags < li->max_frags))
        return 0;  [skips allocation if frags < max_frags]
    netmem_address(li->frag_page.netmem) + frag_offset  [UAF]

If li->frag_page.frags is not reset to li->max_frags in
mlx5e_mpwqe_dealloc_linear_page(), the refill function will see frags <
max_frags and skip the allocation, then compute the address using the stale
netmem pointer that was already returned to the page pool.

Shouldn't this match the initialization in mlx5e_rq_alloc_mpwqe_linear_info()
which sets li->frag_page.frags = li->max_frags?
-- 
pw-bot: cr

  reply	other threads:[~2026-03-24  2:42 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-19  7:50 [PATCH net-next 0/5] net/mlx5e: XDP, Add support for multi-packet per page Tariq Toukan
2026-03-19  7:50 ` [PATCH net-next 1/5] net/mlx5e: XSK, Increase size for chunk_size param Tariq Toukan
2026-03-19  7:50 ` [PATCH net-next 2/5] net/mlx5e: XDP, Improve dma address calculation of linear part for XDP_TX Tariq Toukan
2026-03-19  7:50 ` [PATCH net-next 3/5] net/mlx5e: XDP, Remove stride size limitation Tariq Toukan
2026-03-19  7:50 ` [PATCH net-next 4/5] net/mlx5e: XDP, Use a single linear page per rq Tariq Toukan
2026-03-19  7:50 ` [PATCH net-next 5/5] net/mlx5e: XDP, Use page fragments for linear data in multibuf-mode Tariq Toukan
2026-03-24  2:42   ` Jakub Kicinski [this message]
2026-03-24  8:50     ` Dragos Tatulea

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260324024235.929875-1-kuba@kernel.org \
    --to=kuba@kernel.org \
    --cc=andrew+netdev@lunn.ch \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=cjubran@nvidia.com \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=dtatulea@nvidia.com \
    --cc=edumazet@google.com \
    --cc=gal@nvidia.com \
    --cc=hawk@kernel.org \
    --cc=john.fastabend@gmail.com \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=saeedm@nvidia.com \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.