* [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit()
@ 2026-03-02 5:52 Finn Dayton
2026-03-08 7:03 ` Tariq Toukan
0 siblings, 1 reply; 2+ messages in thread
From: Finn Dayton @ 2026-03-02 5:52 UTC (permalink / raw)
To: netdev@vger.kernel.org
Cc: ast@kernel.org, daniel@iogearbox.net, davem@davemloft.net,
kuba@kernel.org, hawk@kernel.org, john.fastabend@gmail.com,
sdf@fomichev.me, saeedm@nvidia.com, leon@kernel.org,
tariqt@nvidia.com, mbloch@nvidia.com, andrew+netdev@lunn.ch,
edumazet@google.com, pabeni@redhat.com, bpf@vger.kernel.org,
linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org,
Finn Dayton
mlx5e_xdp_xmit() currently selects the xdpsq (send queue) using
smp_processor_id() (i.e. cpu id). When doing XDP_REDIRECT from a cpu
with id >= priv->channels.num, however, mlx5e_xdp_xmit() returns -ENXIO
and the redirect fails.
Previous discussion proposed using modulo or while loop subtraction
in mlx5e_xdp_xmit() to map cpu id to send queue, but these approaches
introduce hot path overhead on modern systems where the number of
logical cores >> the number of XDP send queues (xdpsq).
The below approach precomputes per-cpu priv->xdpsq assignments when
channels are (re)configured and does a constant-time lookup in
mlx5e_xdp_xmit().
Because multiple CPUs may now map to the same xdpsq (whenever cpu count
exceeds channel count), we serialize writes in xdp_xmit with a tx_lock.
Link: https://lore.kernel.org/all/610D8F9E-0038-46D9-AD8A-1D596236B1EF@spacex.com/
Link: https://lore.kernel.org/all/474c1f71-3a5c-4fe5-a01e-80f2ba95fd7e@bytedance.com/
Signed-off-by: Finn Dayton <finnius.dayton@spacex.com>
---
v2:
- Removed unnecessary guards
- Improved variable naming and placement
- Change mapping from cpu -> index to cpu -> xdpsq
- Call smp_wmb() after updates to mapping
drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 +++
.../net/ethernet/mellanox/mlx5/core/en/xdp.c | 17 ++++++-------
.../net/ethernet/mellanox/mlx5/core/en_main.c | 25 +++++++++++++++++++
3 files changed, 36 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d0..713dc7f9bae3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -493,6 +493,8 @@ struct mlx5e_xdpsq {
u16 pc;
struct mlx5_wqe_ctrl_seg *doorbell_cseg;
struct mlx5e_tx_mpwqe mpwqe;
+ /* serialize writes by multiple CPUs to this send queue */
+ spinlock_t tx_lock;
struct mlx5e_cq cq;
@@ -898,6 +900,8 @@ struct mlx5e_priv {
struct mlx5e_selq selq;
struct mlx5e_txqsq **txq2sq;
struct mlx5e_sq_stats **txq2sq_stats;
+ /* selects the xdpsq during mlx5e_xdp_xmit() */
+ struct mlx5e_xdpsq * __percpu *send_queue_ptr;
#ifdef CONFIG_MLX5_CORE_EN_DCB
struct mlx5e_dcbx_dp dcbx_dp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 80f9fc10877a..1db83a69055c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -845,7 +845,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_xdpsq *sq;
int nxmit = 0;
- int sq_num;
int i;
/* this flag is sufficient, no need to test internal sq state */
@@ -854,14 +853,12 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
-
- sq_num = smp_processor_id();
-
- if (unlikely(sq_num >= priv->channels.num))
- return -ENXIO;
-
- sq = priv->channels.c[sq_num]->xdpsq;
-
+ /* Per-CPU xdpsq mapping, rebuilt on channel (re)configuration while XDP TX is disabled */
+ sq = *this_cpu_ptr(priv->send_queue_ptr);
+ /* The number of queues configured on a netdev may be smaller than the
+ * CPU pool, so two CPUs might map to this queue. We must serialize writes.
+ */
+ spin_lock(&sq->tx_lock);
for (i = 0; i < n; i++) {
struct mlx5e_xmit_data_frags xdptxdf = {};
struct xdp_frame *xdpf = frames[i];
@@ -941,7 +938,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
if (flags & XDP_XMIT_FLUSH)
mlx5e_xmit_xdp_doorbell(sq);
-
+ spin_unlock(&sq->tx_lock);
return nxmit;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b6c12460b54a..434db74f096b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1505,6 +1505,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->stop_room = param->is_mpw ? mlx5e_stop_room_for_mpwqe(mdev) :
mlx5e_stop_room_for_max_wqe(mdev);
sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev);
+ spin_lock_init(&sq->tx_lock);
param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@ -3283,10 +3284,27 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
smp_wmb();
}
+static void mlx5e_build_xdpsq_maps(struct mlx5e_priv *priv)
+{
+ /* Build mapping from CPU id to XDP send queue, used by
+ * mlx5e_xdp_xmit() to determine which send queue to transmit packet on.
+ */
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ int send_queue_idx = cpu % priv->channels.num;
+ struct mlx5e_xdpsq *sq = priv->channels.c[send_queue_idx]->xdpsq;
+ *per_cpu_ptr(priv->send_queue_ptr, cpu) = sq;
+ }
+ /* Publish the new CPU->xdpsq map before re-enabling XDP TX */
+ smp_wmb();
+}
+
void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
{
mlx5e_build_txq_maps(priv);
mlx5e_activate_channels(priv, &priv->channels);
+ mlx5e_build_xdpsq_maps(priv);
mlx5e_xdp_tx_enable(priv);
/* dev_watchdog() wants all TX queues to be started when the carrier is
@@ -6262,8 +6280,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
if (!priv->fec_ranges)
goto err_free_channel_stats;
+ priv->send_queue_ptr = alloc_percpu(struct mlx5e_xdpsq *);
+ if (!priv->send_queue_ptr)
+ goto err_free_fec_ranges;
+
return 0;
+err_free_fec_ranges:
+ kfree(priv->fec_ranges);
err_free_channel_stats:
kfree(priv->channel_stats);
err_free_tx_rates:
@@ -6290,6 +6314,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
if (!priv->mdev)
return;
+ free_percpu(priv->send_queue_ptr);
kfree(priv->fec_ranges);
for (i = 0; i < priv->stats_nch; i++)
kvfree(priv->channel_stats[i]);
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit()
2026-03-02 5:52 [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit() Finn Dayton
@ 2026-03-08 7:03 ` Tariq Toukan
0 siblings, 0 replies; 2+ messages in thread
From: Tariq Toukan @ 2026-03-08 7:03 UTC (permalink / raw)
To: Finn Dayton, netdev@vger.kernel.org
Cc: ast@kernel.org, daniel@iogearbox.net, davem@davemloft.net,
kuba@kernel.org, hawk@kernel.org, john.fastabend@gmail.com,
sdf@fomichev.me, saeedm@nvidia.com, leon@kernel.org,
tariqt@nvidia.com, mbloch@nvidia.com, andrew+netdev@lunn.ch,
edumazet@google.com, pabeni@redhat.com, bpf@vger.kernel.org,
linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org
On 02/03/2026 7:52, Finn Dayton wrote:
> mlx5e_xdp_xmit() currently selects the xdpsq (send queue) using
> smp_processor_id() (i.e. cpu id). When doing XDP_REDIRECT from a cpu
> with id >= priv->channels.num, however, mlx5e_xdp_xmit() returns -ENXIO
> and the redirect fails.
>
> Previous discussion proposed using modulo or while loop subtraction
> in mlx5e_xdp_xmit() to map cpu id to send queue, but these approaches
> introduce hot path overhead on modern systems where the number of
> logical cores >> the number of XDP send queues (xdpsq).
>
> The below approach precomputes per-cpu priv->xdpsq assignments when
> channels are (re)configured and does a constant-time lookup in
> mlx5e_xdp_xmit().
>
> Because multiple CPUs may now map to the same xdpsq (whenever cpu count
> exceeds channel count), we serialize writes in xdp_xmit with a tx_lock.
>
> Link: https://lore.kernel.org/all/610D8F9E-0038-46D9-AD8A-1D596236B1EF@spacex.com/
> Link: https://lore.kernel.org/all/474c1f71-3a5c-4fe5-a01e-80f2ba95fd7e@bytedance.com/
> Signed-off-by: Finn Dayton <finnius.dayton@spacex.com>
> ---
> v2:
> - Removed unnecessary guards
> - Improved variable naming and placement
> - Change mapping from cpu -> index to cpu -> xdpsq
> - Call smp_wmb() after updates to mapping
>
Thanks for your patch. Sorry for the delayed feedback.
Were you able to do some basic packet-rate testing for it?
Any noticeable impact?
A few nits below.
> drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 +++
> .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 17 ++++++-------
> .../net/ethernet/mellanox/mlx5/core/en_main.c | 25 +++++++++++++++++++
> 3 files changed, 36 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index ea2cd1f5d1d0..713dc7f9bae3 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -493,6 +493,8 @@ struct mlx5e_xdpsq {
> u16 pc;
> struct mlx5_wqe_ctrl_seg *doorbell_cseg;
> struct mlx5e_tx_mpwqe mpwqe;
> + /* serialize writes by multiple CPUs to this send queue */
> + spinlock_t tx_lock;
>
> struct mlx5e_cq cq;
>
> @@ -898,6 +900,8 @@ struct mlx5e_priv {
> struct mlx5e_selq selq;
> struct mlx5e_txqsq **txq2sq;
> struct mlx5e_sq_stats **txq2sq_stats;
> + /* selects the xdpsq during mlx5e_xdp_xmit() */
> + struct mlx5e_xdpsq * __percpu *send_queue_ptr;
>
> #ifdef CONFIG_MLX5_CORE_EN_DCB
> struct mlx5e_dcbx_dp dcbx_dp;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> index 80f9fc10877a..1db83a69055c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
> @@ -845,7 +845,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
> struct mlx5e_priv *priv = netdev_priv(dev);
> struct mlx5e_xdpsq *sq;
> int nxmit = 0;
> - int sq_num;
> int i;
>
> /* this flag is sufficient, no need to test internal sq state */
> @@ -854,14 +853,12 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
>
> if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> return -EINVAL;
> -
> - sq_num = smp_processor_id();
> -
> - if (unlikely(sq_num >= priv->channels.num))
> - return -ENXIO;
> -
> - sq = priv->channels.c[sq_num]->xdpsq;
> -
> + /* Per-CPU xdpsq mapping, rebuilt on channel (re)configuration while XDP TX is disabled */
Line too long?
> + sq = *this_cpu_ptr(priv->send_queue_ptr);
> + /* The number of queues configured on a netdev may be smaller than the
> + * CPU pool, so two CPUs might map to this queue. We must serialize writes.
> + */
> + spin_lock(&sq->tx_lock);
> for (i = 0; i < n; i++) {
> struct mlx5e_xmit_data_frags xdptxdf = {};
> struct xdp_frame *xdpf = frames[i];
> @@ -941,7 +938,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
>
> if (flags & XDP_XMIT_FLUSH)
> mlx5e_xmit_xdp_doorbell(sq);
> -
> + spin_unlock(&sq->tx_lock);
> return nxmit;
> }
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index b6c12460b54a..434db74f096b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -1505,6 +1505,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
> sq->stop_room = param->is_mpw ? mlx5e_stop_room_for_mpwqe(mdev) :
> mlx5e_stop_room_for_max_wqe(mdev);
> sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev);
> + spin_lock_init(&sq->tx_lock);
>
> param->wq.db_numa_node = cpu_to_node(c->cpu);
> err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
> @@ -3283,10 +3284,27 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
> smp_wmb();
> }
>
> +static void mlx5e_build_xdpsq_maps(struct mlx5e_priv *priv)
> +{
> + /* Build mapping from CPU id to XDP send queue, used by
> + * mlx5e_xdp_xmit() to determine which send queue to transmit packet on.
> + */
> + int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + int send_queue_idx = cpu % priv->channels.num;
> + struct mlx5e_xdpsq *sq = priv->channels.c[send_queue_idx]->xdpsq;
maintain RCT.
> + *per_cpu_ptr(priv->send_queue_ptr, cpu) = sq;
> + }
> + /* Publish the new CPU->xdpsq map before re-enabling XDP TX */
> + smp_wmb();
> +}
> +
> void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
> {
> mlx5e_build_txq_maps(priv);
> mlx5e_activate_channels(priv, &priv->channels);
> + mlx5e_build_xdpsq_maps(priv);
> mlx5e_xdp_tx_enable(priv);
>
> /* dev_watchdog() wants all TX queues to be started when the carrier is
> @@ -6262,8 +6280,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
> if (!priv->fec_ranges)
> goto err_free_channel_stats;
>
> + priv->send_queue_ptr = alloc_percpu(struct mlx5e_xdpsq *);
> + if (!priv->send_queue_ptr)
> + goto err_free_fec_ranges;
> +
> return 0;
>
> +err_free_fec_ranges:
> + kfree(priv->fec_ranges);
> err_free_channel_stats:
> kfree(priv->channel_stats);
> err_free_tx_rates:
> @@ -6290,6 +6314,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
> if (!priv->mdev)
> return;
>
> + free_percpu(priv->send_queue_ptr);
keep symmetric to the order in mlx5e_priv_init.
> kfree(priv->fec_ranges);
> for (i = 0; i < priv->stats_nch; i++)
> kvfree(priv->channel_stats[i]);
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-03-08 7:03 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-02 5:52 [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit() Finn Dayton
2026-03-08 7:03 ` Tariq Toukan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox