public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit()
@ 2026-03-02  5:52 Finn Dayton
  2026-03-08  7:03 ` Tariq Toukan
  0 siblings, 1 reply; 2+ messages in thread
From: Finn Dayton @ 2026-03-02  5:52 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: ast@kernel.org, daniel@iogearbox.net, davem@davemloft.net,
	kuba@kernel.org, hawk@kernel.org, john.fastabend@gmail.com,
	sdf@fomichev.me, saeedm@nvidia.com, leon@kernel.org,
	tariqt@nvidia.com, mbloch@nvidia.com, andrew+netdev@lunn.ch,
	edumazet@google.com, pabeni@redhat.com, bpf@vger.kernel.org,
	linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org,
	Finn Dayton

mlx5e_xdp_xmit() currently selects the xdpsq (send queue) using
smp_processor_id() (i.e. cpu id). When doing XDP_REDIRECT from a cpu
with id >= priv->channels.num, however, mlx5e_xdp_xmit() returns -ENXIO
and the redirect fails.

Previous discussion proposed using modulo or while loop subtraction
in mlx5e_xdp_xmit() to map cpu id to send queue, but these approaches
introduce hot path overhead on modern systems where the number of
logical cores >> the number of XDP send queues (xdpsq).

The below approach precomputes per-cpu priv->xdpsq assignments when
channels are (re)configured and does a constant-time lookup in
mlx5e_xdp_xmit().

Because two or more CPUs may now map to the same xdpsq (whenever the CPU
count exceeds the channel count), serialize writers in mlx5e_xdp_xmit()
with a per-queue tx_lock.

Link: https://lore.kernel.org/all/610D8F9E-0038-46D9-AD8A-1D596236B1EF@spacex.com/
Link: https://lore.kernel.org/all/474c1f71-3a5c-4fe5-a01e-80f2ba95fd7e@bytedance.com/
Signed-off-by: Finn Dayton <finnius.dayton@spacex.com>
---
v2:
- Removed unnecessary guards
- Improved variable naming and placement
- Change mapping from cpu -> index to cpu -> xdpsq
- Call smp_wmb() after updates to mapping

 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  4 +++
 .../net/ethernet/mellanox/mlx5/core/en/xdp.c  | 17 ++++++-------
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 25 +++++++++++++++++++
 3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d0..713dc7f9bae3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -493,6 +493,8 @@ struct mlx5e_xdpsq {
 	u16                        pc;
 	struct mlx5_wqe_ctrl_seg   *doorbell_cseg;
 	struct mlx5e_tx_mpwqe      mpwqe;
+	/* serialize writes by multiple CPUs to this send queue */
+	spinlock_t                 tx_lock;
 
 	struct mlx5e_cq            cq;
 
@@ -898,6 +900,8 @@ struct mlx5e_priv {
 	struct mlx5e_selq selq;
 	struct mlx5e_txqsq **txq2sq;
 	struct mlx5e_sq_stats **txq2sq_stats;
+	/* selects the xdpsq during mlx5e_xdp_xmit() */
+	struct mlx5e_xdpsq * __percpu *send_queue_ptr;
 
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	struct mlx5e_dcbx_dp       dcbx_dp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 80f9fc10877a..1db83a69055c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -845,7 +845,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_xdpsq *sq;
 	int nxmit = 0;
-	int sq_num;
 	int i;
 
 	/* this flag is sufficient, no need to test internal sq state */
@@ -854,14 +853,12 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 
 	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
 		return -EINVAL;
-
-	sq_num = smp_processor_id();
-
-	if (unlikely(sq_num >= priv->channels.num))
-		return -ENXIO;
-
-	sq = priv->channels.c[sq_num]->xdpsq;
-
+	/* Per-CPU xdpsq mapping, rebuilt on channel (re)configuration while XDP TX is disabled */
+	sq = *this_cpu_ptr(priv->send_queue_ptr);
+	/* The number of queues configured on a netdev may be smaller than the
+	 * CPU pool, so two CPUs might map to this queue. We must serialize writes.
+	 */
+	spin_lock(&sq->tx_lock);
 	for (i = 0; i < n; i++) {
 		struct mlx5e_xmit_data_frags xdptxdf = {};
 		struct xdp_frame *xdpf = frames[i];
@@ -941,7 +938,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
 
 	if (flags & XDP_XMIT_FLUSH)
 		mlx5e_xmit_xdp_doorbell(sq);
-
+	spin_unlock(&sq->tx_lock);
 	return nxmit;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b6c12460b54a..434db74f096b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1505,6 +1505,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
 	sq->stop_room = param->is_mpw ? mlx5e_stop_room_for_mpwqe(mdev) :
 					mlx5e_stop_room_for_max_wqe(mdev);
 	sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev);
+	spin_lock_init(&sq->tx_lock);
 
 	param->wq.db_numa_node = cpu_to_node(c->cpu);
 	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@ -3283,10 +3284,27 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
 	smp_wmb();
 }
 
+/* Build the per-CPU cpu -> xdpsq map used by mlx5e_xdp_xmit() */
+static void mlx5e_build_xdpsq_maps(struct mlx5e_priv *priv)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct mlx5e_xdpsq *sq =
+			priv->channels.c[cpu % priv->channels.num]->xdpsq;
+
+		*per_cpu_ptr(priv->send_queue_ptr, cpu) = sq;
+	}
+	/* Publish the new CPU->xdpsq map before re-enabling XDP TX */
+	smp_wmb();
+}
+
 void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
 {
 	mlx5e_build_txq_maps(priv);
 	mlx5e_activate_channels(priv, &priv->channels);
+	mlx5e_build_xdpsq_maps(priv);
 	mlx5e_xdp_tx_enable(priv);
 
 	/* dev_watchdog() wants all TX queues to be started when the carrier is
@@ -6262,8 +6280,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
 	if (!priv->fec_ranges)
 		goto err_free_channel_stats;
 
+	priv->send_queue_ptr = alloc_percpu(struct mlx5e_xdpsq *);
+	if (!priv->send_queue_ptr)
+		goto err_free_fec_ranges;
+
 	return 0;
 
+err_free_fec_ranges:
+	kfree(priv->fec_ranges);
 err_free_channel_stats:
 	kfree(priv->channel_stats);
 err_free_tx_rates:
@@ -6290,6 +6314,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
 	if (!priv->mdev)
 		return;
 
+	free_percpu(priv->send_queue_ptr);
 	kfree(priv->fec_ranges);
 	for (i = 0; i < priv->stats_nch; i++)
 		kvfree(priv->channel_stats[i]);
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-03-08  7:03 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-02  5:52 [PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit() Finn Dayton
2026-03-08  7:03 ` Tariq Toukan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox