* [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ
@ 2026-01-23 22:39 Daniel Borkmann
2026-01-25 8:33 ` Tariq Toukan
0 siblings, 1 reply; 12+ messages in thread
From: Daniel Borkmann @ 2026-01-23 22:39 UTC (permalink / raw)
To: netdev; +Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski
This reverts the following commits:
- ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct")
- 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI")
- 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation")
- abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ")
There are a couple of regressions on the xsk side I ran into:
Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read-
side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq()
-> synchronize_net(). The stack holds the RCU read lock in xsk_poll().
Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup():
[ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240
[ 103.963743] #PF: supervisor read access in kernel mode
[ 103.963746] #PF: error_code(0x0000) - not-present page
[ 103.963749] PGD 0 P4D 0
[ 103.963752] Oops: Oops: 0000 [#1] SMP
[ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none)
[ 103.963761] Hardware name: [...]
[ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core]
What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup()
and therefore access to c->async_icosq->state triggers it. (On the NIC
there is an XDP program installed by the control plane where traffic
gets redirected into an xsk map - there was no xsk pool set up yet.
At some later time an xsk pool is set up and the related xsk socket is
added to the xsk map of the XDP program.)
Reverting the series fixes the problems again.
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: William Tu <witu@nvidia.com>
Cc: Tariq Toukan <tariqt@nvidia.com>
Cc: David Wei <dw@davidwei.uk>
Cc: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 26 +----
.../mellanox/mlx5/core/en/reporter_tx.c | 1 -
.../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 3 -
.../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 6 +-
.../mellanox/mlx5/core/en_accel/ktls.c | 10 +-
.../mellanox/mlx5/core/en_accel/ktls_rx.c | 26 ++---
.../mellanox/mlx5/core/en_accel/ktls_txrx.h | 3 +-
.../net/ethernet/mellanox/mlx5/core/en_main.c | 100 +++++-------------
.../net/ethernet/mellanox/mlx5/core/en_rx.c | 4 -
.../net/ethernet/mellanox/mlx5/core/en_txrx.c | 37 +++----
10 files changed, 62 insertions(+), 154 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 19b9683f4622..ff4ab4691baf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -388,7 +388,6 @@ enum {
MLX5E_SQ_STATE_DIM,
MLX5E_SQ_STATE_PENDING_XSK_TX,
MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC,
- MLX5E_SQ_STATE_LOCK_NEEDED,
MLX5E_NUM_SQ_STATES, /* Must be kept last */
};
@@ -546,11 +545,6 @@ struct mlx5e_icosq {
u32 sqn;
u16 reserved_room;
unsigned long state;
- /* icosq can be accessed from any CPU and from different contexts
- * (NAPI softirq or process/workqueue). Always use spin_lock_bh for
- * simplicity and correctness across all contexts.
- */
- spinlock_t lock;
struct mlx5e_ktls_resync_resp *ktls_resync;
/* control path */
@@ -782,7 +776,9 @@ struct mlx5e_channel {
struct mlx5e_xdpsq xsksq;
/* Async ICOSQ */
- struct mlx5e_icosq *async_icosq;
+ struct mlx5e_icosq async_icosq;
+ /* async_icosq can be accessed from any CPU - the spinlock protects it. */
+ spinlock_t async_icosq_lock;
/* data path - accessed per napi poll */
const struct cpumask *aff_mask;
@@ -805,21 +801,6 @@ struct mlx5e_channel {
struct dim_cq_moder tx_cq_moder;
};
-static inline bool mlx5e_icosq_sync_lock(struct mlx5e_icosq *sq)
-{
- if (likely(!test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state)))
- return false;
-
- spin_lock_bh(&sq->lock);
- return true;
-}
-
-static inline void mlx5e_icosq_sync_unlock(struct mlx5e_icosq *sq, bool locked)
-{
- if (unlikely(locked))
- spin_unlock_bh(&sq->lock);
-}
-
struct mlx5e_ptp;
struct mlx5e_channels {
@@ -939,7 +920,6 @@ struct mlx5e_priv {
u8 max_opened_tc;
bool tx_ptp_opened;
bool rx_ptp_opened;
- bool ktls_rx_was_enabled;
struct kernel_hwtstamp_config hwtstamp_config;
u16 q_counter[MLX5_SD_MAX_GROUP_SZ];
u16 drop_rq_q_counter;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 4adc1adf9897..9e2cf191ed30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -15,7 +15,6 @@ static const char * const sq_sw_state_type_name[] = {
[MLX5E_SQ_STATE_DIM] = "dim",
[MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx",
[MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync",
- [MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed",
};
static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 4f984f6a2cb9..2b05536d564a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -23,7 +23,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
struct mlx5_wq_cyc *wq = &icosq->wq;
struct mlx5e_umr_wqe *umr_wqe;
struct xdp_buff **xsk_buffs;
- bool sync_locked;
int batch, i;
u32 offset; /* 17-bit value with MTT. */
u16 pi;
@@ -48,7 +47,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
goto err_reuse_batch;
}
- sync_locked = mlx5e_icosq_sync_lock(icosq);
pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs);
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
@@ -145,7 +143,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
};
icosq->pc += rq->mpwqe.umr_wqebbs;
- mlx5e_icosq_sync_unlock(icosq, sync_locked);
icosq->doorbell_cseg = &umr_wqe->hdr.ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 9e33156fac8a..a59199ed590d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -26,12 +26,10 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
* active and not polled by NAPI. Return 0, because the upcoming
* activate will trigger the IRQ for us.
*/
- if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED,
- &c->async_icosq->state)))
+ if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state)))
return 0;
- if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX,
- &c->async_icosq->state))
+ if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state))
return 0;
mlx5e_trigger_napi_icosq(c);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
index 1c2cc2aad2b0..e3e57c849436 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c
@@ -135,15 +135,10 @@ int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable)
int err = 0;
mutex_lock(&priv->state_lock);
- if (enable) {
+ if (enable)
err = mlx5e_accel_fs_tcp_create(priv->fs);
- if (!err && !priv->ktls_rx_was_enabled) {
- priv->ktls_rx_was_enabled = true;
- mlx5e_safe_reopen_channels(priv);
- }
- } else {
+ else
mlx5e_accel_fs_tcp_destroy(priv->fs);
- }
mutex_unlock(&priv->state_lock);
return err;
@@ -166,7 +161,6 @@ int mlx5e_ktls_init_rx(struct mlx5e_priv *priv)
destroy_workqueue(priv->tls->rx_wq);
return err;
}
- priv->ktls_rx_was_enabled = true;
}
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
index 5d8fe252799e..da2d1eb52c13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c
@@ -202,8 +202,8 @@ static int post_rx_param_wqes(struct mlx5e_channel *c,
int err;
err = 0;
- sq = c->async_icosq;
- spin_lock_bh(&sq->lock);
+ sq = &c->async_icosq;
+ spin_lock_bh(&c->async_icosq_lock);
cseg = post_static_params(sq, priv_rx);
if (IS_ERR(cseg))
@@ -214,7 +214,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c,
mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg);
unlock:
- spin_unlock_bh(&sq->lock);
+ spin_unlock_bh(&c->async_icosq_lock);
return err;
@@ -277,10 +277,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq,
buf->priv_rx = priv_rx;
- spin_lock_bh(&sq->lock);
+ spin_lock_bh(&sq->channel->async_icosq_lock);
if (unlikely(!mlx5e_icosq_can_post_wqe(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS))) {
- spin_unlock_bh(&sq->lock);
+ spin_unlock_bh(&sq->channel->async_icosq_lock);
err = -ENOSPC;
goto err_dma_unmap;
}
@@ -311,7 +311,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq,
icosq_fill_wi(sq, pi, &wi);
sq->pc++;
mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg);
- spin_unlock_bh(&sq->lock);
+ spin_unlock_bh(&sq->channel->async_icosq_lock);
return 0;
@@ -344,7 +344,7 @@ static void resync_handle_work(struct work_struct *work)
}
c = resync->priv->channels.c[priv_rx->rxq];
- sq = c->async_icosq;
+ sq = &c->async_icosq;
if (resync_post_get_progress_params(sq, priv_rx)) {
priv_rx->rq_stats->tls_resync_req_skip++;
@@ -371,7 +371,7 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r
struct mlx5e_icosq *sq;
bool trigger_poll;
- sq = c->async_icosq;
+ sq = &c->async_icosq;
ktls_resync = sq->ktls_resync;
trigger_poll = false;
@@ -413,9 +413,9 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r
return;
if (!napi_if_scheduled_mark_missed(&c->napi)) {
- spin_lock_bh(&sq->lock);
+ spin_lock_bh(&c->async_icosq_lock);
mlx5e_trigger_irq(sq);
- spin_unlock_bh(&sq->lock);
+ spin_unlock_bh(&c->async_icosq_lock);
}
}
@@ -753,7 +753,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget)
LIST_HEAD(local_list);
int i, j;
- sq = c->async_icosq;
+ sq = &c->async_icosq;
if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)))
return false;
@@ -772,7 +772,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget)
clear_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state);
spin_unlock(&ktls_resync->lock);
- spin_lock(&sq->lock);
+ spin_lock(&c->async_icosq_lock);
for (j = 0; j < i; j++) {
struct mlx5_wqe_ctrl_seg *cseg;
@@ -791,7 +791,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget)
}
if (db_cseg)
mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, db_cseg);
- spin_unlock(&sq->lock);
+ spin_unlock(&c->async_icosq_lock);
priv_rx->rq_stats->tls_resync_res_ok += j;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h
index 4022c7e78a2e..cb08799769ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h
@@ -50,8 +50,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget);
static inline bool
mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget)
{
- return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC,
- &c->async_icosq->state);
+ return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state);
}
static inline void
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e1e05c9e7ebb..446510153e5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2075,8 +2075,6 @@ static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params
if (err)
goto err_free_icosq;
- spin_lock_init(&sq->lock);
-
if (param->is_tls) {
sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list();
if (IS_ERR(sq->ktls_resync)) {
@@ -2589,51 +2587,9 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param
return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), q_counter, &c->rq);
}
-static struct mlx5e_icosq *
-mlx5e_open_async_icosq(struct mlx5e_channel *c,
- struct mlx5e_params *params,
- struct mlx5e_channel_param *cparam,
- struct mlx5e_create_cq_param *ccp)
-{
- struct dim_cq_moder icocq_moder = {0, 0};
- struct mlx5e_icosq *async_icosq;
- int err;
-
- async_icosq = kvzalloc_node(sizeof(*async_icosq), GFP_KERNEL,
- cpu_to_node(c->cpu));
- if (!async_icosq)
- return ERR_PTR(-ENOMEM);
-
- err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, ccp,
- &async_icosq->cq);
- if (err)
- goto err_free_async_icosq;
-
- err = mlx5e_open_icosq(c, params, &cparam->async_icosq, async_icosq,
- mlx5e_async_icosq_err_cqe_work);
- if (err)
- goto err_close_async_icosq_cq;
-
- return async_icosq;
-
-err_close_async_icosq_cq:
- mlx5e_close_cq(&async_icosq->cq);
-err_free_async_icosq:
- kvfree(async_icosq);
- return ERR_PTR(err);
-}
-
-static void mlx5e_close_async_icosq(struct mlx5e_icosq *async_icosq)
-{
- mlx5e_close_icosq(async_icosq);
- mlx5e_close_cq(&async_icosq->cq);
- kvfree(async_icosq);
-}
-
static int mlx5e_open_queues(struct mlx5e_channel *c,
struct mlx5e_params *params,
- struct mlx5e_channel_param *cparam,
- bool async_icosq_needed)
+ struct mlx5e_channel_param *cparam)
{
const struct net_device_ops *netdev_ops = c->netdev->netdev_ops;
struct dim_cq_moder icocq_moder = {0, 0};
@@ -2642,10 +2598,15 @@ static int mlx5e_open_queues(struct mlx5e_channel *c,
mlx5e_build_create_cq_param(&ccp, c);
+ err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, &ccp,
+ &c->async_icosq.cq);
+ if (err)
+ return err;
+
err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->icosq.cqp, &ccp,
&c->icosq.cq);
if (err)
- return err;
+ goto err_close_async_icosq_cq;
err = mlx5e_open_tx_cqs(c, params, &ccp, cparam);
if (err)
@@ -2669,14 +2630,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c,
if (err)
goto err_close_rx_cq;
- if (async_icosq_needed) {
- c->async_icosq = mlx5e_open_async_icosq(c, params, cparam,
- &ccp);
- if (IS_ERR(c->async_icosq)) {
- err = PTR_ERR(c->async_icosq);
- goto err_close_rq_xdpsq_cq;
- }
- }
+ spin_lock_init(&c->async_icosq_lock);
+
+ err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq,
+ mlx5e_async_icosq_err_cqe_work);
+ if (err)
+ goto err_close_rq_xdpsq_cq;
mutex_init(&c->icosq_recovery_lock);
@@ -2712,8 +2671,7 @@ static int mlx5e_open_queues(struct mlx5e_channel *c,
mlx5e_close_icosq(&c->icosq);
err_close_async_icosq:
- if (c->async_icosq)
- mlx5e_close_async_icosq(c->async_icosq);
+ mlx5e_close_icosq(&c->async_icosq);
err_close_rq_xdpsq_cq:
if (c->xdp)
@@ -2732,6 +2690,9 @@ static int mlx5e_open_queues(struct mlx5e_channel *c,
err_close_icosq_cq:
mlx5e_close_cq(&c->icosq.cq);
+err_close_async_icosq_cq:
+ mlx5e_close_cq(&c->async_icosq.cq);
+
return err;
}
@@ -2745,8 +2706,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c)
mlx5e_close_sqs(c);
mlx5e_close_icosq(&c->icosq);
mutex_destroy(&c->icosq_recovery_lock);
- if (c->async_icosq)
- mlx5e_close_async_icosq(c->async_icosq);
+ mlx5e_close_icosq(&c->async_icosq);
if (c->xdp)
mlx5e_close_cq(&c->rq_xdpsq.cq);
mlx5e_close_cq(&c->rq.cq);
@@ -2754,6 +2714,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c)
mlx5e_close_xdpredirect_sq(c->xdpsq);
mlx5e_close_tx_cqs(c);
mlx5e_close_cq(&c->icosq.cq);
+ mlx5e_close_cq(&c->async_icosq.cq);
}
static u8 mlx5e_enumerate_lag_port(struct mlx5_core_dev *mdev, int ix)
@@ -2789,16 +2750,9 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu)
void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c)
{
- bool locked;
-
- if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state))
- synchronize_net();
-
- locked = mlx5e_icosq_sync_lock(&c->icosq);
- mlx5e_trigger_irq(&c->icosq);
- mlx5e_icosq_sync_unlock(&c->icosq, locked);
-
- clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state);
+ spin_lock_bh(&c->async_icosq_lock);
+ mlx5e_trigger_irq(&c->async_icosq);
+ spin_unlock_bh(&c->async_icosq_lock);
}
void mlx5e_trigger_napi_sched(struct napi_struct *napi)
@@ -2831,7 +2785,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
struct mlx5e_channel_param *cparam;
struct mlx5_core_dev *mdev;
struct mlx5e_xsk_param xsk;
- bool async_icosq_needed;
struct mlx5e_channel *c;
unsigned int irq;
int vec_ix;
@@ -2881,8 +2834,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix);
netif_napi_set_irq_locked(&c->napi, irq);
- async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled;
- err = mlx5e_open_queues(c, params, cparam, async_icosq_needed);
+ err = mlx5e_open_queues(c, params, cparam);
if (unlikely(err))
goto err_napi_del;
@@ -2920,8 +2872,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_activate_txqsq(&c->sq[tc]);
mlx5e_activate_icosq(&c->icosq);
- if (c->async_icosq)
- mlx5e_activate_icosq(c->async_icosq);
+ mlx5e_activate_icosq(&c->async_icosq);
if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
mlx5e_activate_xsk(c);
@@ -2942,8 +2893,7 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
else
mlx5e_deactivate_rq(&c->rq);
- if (c->async_icosq)
- mlx5e_deactivate_icosq(c->async_icosq);
+ mlx5e_deactivate_icosq(&c->async_icosq);
mlx5e_deactivate_icosq(&c->icosq);
for (tc = 0; tc < c->num_tc; tc++)
mlx5e_deactivate_txqsq(&c->sq[tc]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 1fc3720d2201..1f6930c77437 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -778,7 +778,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
struct mlx5_wq_cyc *wq = &sq->wq;
struct mlx5e_umr_wqe *umr_wqe;
u32 offset; /* 17-bit value with MTT. */
- bool sync_locked;
u16 pi;
int err;
int i;
@@ -789,7 +788,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
goto err;
}
- sync_locked = mlx5e_icosq_sync_lock(sq);
pi = mlx5e_icosq_get_next_pi(sq, rq->mpwqe.umr_wqebbs);
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
@@ -837,14 +835,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
};
sq->pc += rq->mpwqe.umr_wqebbs;
- mlx5e_icosq_sync_unlock(sq, sync_locked);
sq->doorbell_cseg = &umr_wqe->hdr.ctrl;
return 0;
err_unmap:
- mlx5e_icosq_sync_unlock(sq, sync_locked);
while (--i >= 0) {
frag_page--;
mlx5e_page_release_fragmented(rq->page_pool, frag_page);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index b31f689fe271..76108299ea57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -125,7 +125,6 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
{
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
napi);
- struct mlx5e_icosq *aicosq = c->async_icosq;
struct mlx5e_ch_stats *ch_stats = c->stats;
struct mlx5e_xdpsq *xsksq = &c->xsksq;
struct mlx5e_txqsq __rcu **qos_sqs;
@@ -181,18 +180,15 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
busy |= work_done == budget;
mlx5e_poll_ico_cq(&c->icosq.cq);
- if (aicosq) {
- if (mlx5e_poll_ico_cq(&aicosq->cq))
- /* Don't clear the flag if nothing was polled to prevent
- * queueing more WQEs and overflowing the async ICOSQ.
- */
- clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX,
- &aicosq->state);
-
- /* Keep after async ICOSQ CQ poll */
- if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget)))
- busy |= mlx5e_ktls_rx_handle_resync_list(c, budget);
- }
+ if (mlx5e_poll_ico_cq(&c->async_icosq.cq))
+ /* Don't clear the flag if nothing was polled to prevent
+ * queueing more WQEs and overflowing the async ICOSQ.
+ */
+ clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state);
+
+ /* Keep after async ICOSQ CQ poll */
+ if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget)))
+ busy |= mlx5e_ktls_rx_handle_resync_list(c, budget);
busy |= INDIRECT_CALL_2(rq->post_wqes,
mlx5e_post_rx_mpwqes,
@@ -240,17 +236,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
mlx5e_cq_arm(&rq->cq);
mlx5e_cq_arm(&c->icosq.cq);
- if (aicosq) {
- mlx5e_cq_arm(&aicosq->cq);
- if (xsk_open) {
- mlx5e_handle_rx_dim(xskrq);
- mlx5e_cq_arm(&xsksq->cq);
- mlx5e_cq_arm(&xskrq->cq);
- }
- }
+ mlx5e_cq_arm(&c->async_icosq.cq);
if (c->xdpsq)
mlx5e_cq_arm(&c->xdpsq->cq);
+ if (xsk_open) {
+ mlx5e_handle_rx_dim(xskrq);
+ mlx5e_cq_arm(&xsksq->cq);
+ mlx5e_cq_arm(&xskrq->cq);
+ }
+
if (unlikely(aff_change && busy_xsk)) {
mlx5e_trigger_irq(&c->icosq);
ch_stats->force_irq++;
--
2.43.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-01-23 22:39 [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ Daniel Borkmann @ 2026-01-25 8:33 ` Tariq Toukan 2026-01-26 9:23 ` Daniel Borkmann 2026-01-26 16:06 ` Alice Mikityanska 0 siblings, 2 replies; 12+ messages in thread From: Tariq Toukan @ 2026-01-25 8:33 UTC (permalink / raw) To: Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman On 24/01/2026 0:39, Daniel Borkmann wrote: > This reverts the following commits: > > - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") > - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") > - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") > - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") > > There are a couple of regressions on the xsk side I ran into: > > Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- > side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() > -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). > > Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): > > [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 > [ 103.963743] #PF: supervisor read access in kernel mode > [ 103.963746] #PF: error_code(0x0000) - not-present page > [ 103.963749] PGD 0 P4D 0 > [ 103.963752] Oops: Oops: 0000 [#1] SMP > [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) > [ 103.963761] Hardware name: [...] > [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] > > What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() > and therefore access to c->async_icosq->state triggers it. 
(On the NIC > there is an XDP program installed by the control plane where traffic > gets redirected into an xsk map - there was no xsk pool set up yet. > At some later time a xsk pool is set up and the related xsk socket is > added to the xsk map of the XDP program.) > Hi Daniel, Thanks for your report. > Reverting the series fixes the problems again. > Revert is too aggressive here. A fix is preferable. We're investigating the issue in order to fix it. We'll update. > Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> > Cc: William Tu <witu@nvidia.com> > Cc: Tariq Toukan <tariqt@nvidia.com> > Cc: David Wei <dw@davidwei.uk> > Cc: Jakub Kicinski <kuba@kernel.org> > --- > drivers/net/ethernet/mellanox/mlx5/core/en.h | 26 +---- > .../mellanox/mlx5/core/en/reporter_tx.c | 1 - > .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 3 - > .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 6 +- > .../mellanox/mlx5/core/en_accel/ktls.c | 10 +- > .../mellanox/mlx5/core/en_accel/ktls_rx.c | 26 ++--- > .../mellanox/mlx5/core/en_accel/ktls_txrx.h | 3 +- > .../net/ethernet/mellanox/mlx5/core/en_main.c | 100 +++++------------- > .../net/ethernet/mellanox/mlx5/core/en_rx.c | 4 - > .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 37 +++---- > 10 files changed, 62 insertions(+), 154 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h > index 19b9683f4622..ff4ab4691baf 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h > @@ -388,7 +388,6 @@ enum { > MLX5E_SQ_STATE_DIM, > MLX5E_SQ_STATE_PENDING_XSK_TX, > MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, > - MLX5E_SQ_STATE_LOCK_NEEDED, > MLX5E_NUM_SQ_STATES, /* Must be kept last */ > }; > > @@ -546,11 +545,6 @@ struct mlx5e_icosq { > u32 sqn; > u16 reserved_room; > unsigned long state; > - /* icosq can be accessed from any CPU and from different contexts > - * (NAPI softirq or process/workqueue). 
Always use spin_lock_bh for > - * simplicity and correctness across all contexts. > - */ > - spinlock_t lock; > struct mlx5e_ktls_resync_resp *ktls_resync; > > /* control path */ > @@ -782,7 +776,9 @@ struct mlx5e_channel { > struct mlx5e_xdpsq xsksq; > > /* Async ICOSQ */ > - struct mlx5e_icosq *async_icosq; > + struct mlx5e_icosq async_icosq; > + /* async_icosq can be accessed from any CPU - the spinlock protects it. */ > + spinlock_t async_icosq_lock; > > /* data path - accessed per napi poll */ > const struct cpumask *aff_mask; > @@ -805,21 +801,6 @@ struct mlx5e_channel { > struct dim_cq_moder tx_cq_moder; > }; > > -static inline bool mlx5e_icosq_sync_lock(struct mlx5e_icosq *sq) > -{ > - if (likely(!test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state))) > - return false; > - > - spin_lock_bh(&sq->lock); > - return true; > -} > - > -static inline void mlx5e_icosq_sync_unlock(struct mlx5e_icosq *sq, bool locked) > -{ > - if (unlikely(locked)) > - spin_unlock_bh(&sq->lock); > -} > - > struct mlx5e_ptp; > > struct mlx5e_channels { > @@ -939,7 +920,6 @@ struct mlx5e_priv { > u8 max_opened_tc; > bool tx_ptp_opened; > bool rx_ptp_opened; > - bool ktls_rx_was_enabled; > struct kernel_hwtstamp_config hwtstamp_config; > u16 q_counter[MLX5_SD_MAX_GROUP_SZ]; > u16 drop_rq_q_counter; > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c > index 4adc1adf9897..9e2cf191ed30 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c > @@ -15,7 +15,6 @@ static const char * const sq_sw_state_type_name[] = { > [MLX5E_SQ_STATE_DIM] = "dim", > [MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx", > [MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync", > - [MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed", > }; > > static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) > diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c > index 4f984f6a2cb9..2b05536d564a 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c > @@ -23,7 +23,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > struct mlx5_wq_cyc *wq = &icosq->wq; > struct mlx5e_umr_wqe *umr_wqe; > struct xdp_buff **xsk_buffs; > - bool sync_locked; > int batch, i; > u32 offset; /* 17-bit value with MTT. */ > u16 pi; > @@ -48,7 +47,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > goto err_reuse_batch; > } > > - sync_locked = mlx5e_icosq_sync_lock(icosq); > pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs); > umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); > memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); > @@ -145,7 +143,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > }; > > icosq->pc += rq->mpwqe.umr_wqebbs; > - mlx5e_icosq_sync_unlock(icosq, sync_locked); > > icosq->doorbell_cseg = &umr_wqe->hdr.ctrl; > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c > index 9e33156fac8a..a59199ed590d 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c > @@ -26,12 +26,10 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) > * active and not polled by NAPI. Return 0, because the upcoming > * activate will trigger the IRQ for us. 
> */ > - if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, > - &c->async_icosq->state))) > + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state))) > return 0; > > - if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, > - &c->async_icosq->state)) > + if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) > return 0; > > mlx5e_trigger_napi_icosq(c); > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c > index 1c2cc2aad2b0..e3e57c849436 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c > @@ -135,15 +135,10 @@ int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) > int err = 0; > > mutex_lock(&priv->state_lock); > - if (enable) { > + if (enable) > err = mlx5e_accel_fs_tcp_create(priv->fs); > - if (!err && !priv->ktls_rx_was_enabled) { > - priv->ktls_rx_was_enabled = true; > - mlx5e_safe_reopen_channels(priv); > - } > - } else { > + else > mlx5e_accel_fs_tcp_destroy(priv->fs); > - } > mutex_unlock(&priv->state_lock); > > return err; > @@ -166,7 +161,6 @@ int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) > destroy_workqueue(priv->tls->rx_wq); > return err; > } > - priv->ktls_rx_was_enabled = true; > } > > return 0; > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c > index 5d8fe252799e..da2d1eb52c13 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c > @@ -202,8 +202,8 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, > int err; > > err = 0; > - sq = c->async_icosq; > - spin_lock_bh(&sq->lock); > + sq = &c->async_icosq; > + spin_lock_bh(&c->async_icosq_lock); > > cseg = post_static_params(sq, priv_rx); > if (IS_ERR(cseg)) > @@ -214,7 +214,7 @@ static int post_rx_param_wqes(struct 
mlx5e_channel *c, > > mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); > unlock: > - spin_unlock_bh(&sq->lock); > + spin_unlock_bh(&c->async_icosq_lock); > > return err; > > @@ -277,10 +277,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, > > buf->priv_rx = priv_rx; > > - spin_lock_bh(&sq->lock); > + spin_lock_bh(&sq->channel->async_icosq_lock); > > if (unlikely(!mlx5e_icosq_can_post_wqe(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS))) { > - spin_unlock_bh(&sq->lock); > + spin_unlock_bh(&sq->channel->async_icosq_lock); > err = -ENOSPC; > goto err_dma_unmap; > } > @@ -311,7 +311,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, > icosq_fill_wi(sq, pi, &wi); > sq->pc++; > mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); > - spin_unlock_bh(&sq->lock); > + spin_unlock_bh(&sq->channel->async_icosq_lock); > > return 0; > > @@ -344,7 +344,7 @@ static void resync_handle_work(struct work_struct *work) > } > > c = resync->priv->channels.c[priv_rx->rxq]; > - sq = c->async_icosq; > + sq = &c->async_icosq; > > if (resync_post_get_progress_params(sq, priv_rx)) { > priv_rx->rq_stats->tls_resync_req_skip++; > @@ -371,7 +371,7 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r > struct mlx5e_icosq *sq; > bool trigger_poll; > > - sq = c->async_icosq; > + sq = &c->async_icosq; > ktls_resync = sq->ktls_resync; > trigger_poll = false; > > @@ -413,9 +413,9 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r > return; > > if (!napi_if_scheduled_mark_missed(&c->napi)) { > - spin_lock_bh(&sq->lock); > + spin_lock_bh(&c->async_icosq_lock); > mlx5e_trigger_irq(sq); > - spin_unlock_bh(&sq->lock); > + spin_unlock_bh(&c->async_icosq_lock); > } > } > > @@ -753,7 +753,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) > LIST_HEAD(local_list); > int i, j; > > - sq = c->async_icosq; > + sq = &c->async_icosq; > > if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) > return 
false; > @@ -772,7 +772,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) > clear_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); > spin_unlock(&ktls_resync->lock); > > - spin_lock(&sq->lock); > + spin_lock(&c->async_icosq_lock); > for (j = 0; j < i; j++) { > struct mlx5_wqe_ctrl_seg *cseg; > > @@ -791,7 +791,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) > } > if (db_cseg) > mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, db_cseg); > - spin_unlock(&sq->lock); > + spin_unlock(&c->async_icosq_lock); > > priv_rx->rq_stats->tls_resync_res_ok += j; > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h > index 4022c7e78a2e..cb08799769ee 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h > @@ -50,8 +50,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); > static inline bool > mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) > { > - return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, > - &c->async_icosq->state); > + return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); > } > > static inline void > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > index e1e05c9e7ebb..446510153e5e 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c > @@ -2075,8 +2075,6 @@ static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params > if (err) > goto err_free_icosq; > > - spin_lock_init(&sq->lock); > - > if (param->is_tls) { > sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list(); > if (IS_ERR(sq->ktls_resync)) { > @@ -2589,51 +2587,9 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, 
struct mlx5e_params *param > return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), q_counter, &c->rq); > } > > -static struct mlx5e_icosq * > -mlx5e_open_async_icosq(struct mlx5e_channel *c, > - struct mlx5e_params *params, > - struct mlx5e_channel_param *cparam, > - struct mlx5e_create_cq_param *ccp) > -{ > - struct dim_cq_moder icocq_moder = {0, 0}; > - struct mlx5e_icosq *async_icosq; > - int err; > - > - async_icosq = kvzalloc_node(sizeof(*async_icosq), GFP_KERNEL, > - cpu_to_node(c->cpu)); > - if (!async_icosq) > - return ERR_PTR(-ENOMEM); > - > - err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, ccp, > - &async_icosq->cq); > - if (err) > - goto err_free_async_icosq; > - > - err = mlx5e_open_icosq(c, params, &cparam->async_icosq, async_icosq, > - mlx5e_async_icosq_err_cqe_work); > - if (err) > - goto err_close_async_icosq_cq; > - > - return async_icosq; > - > -err_close_async_icosq_cq: > - mlx5e_close_cq(&async_icosq->cq); > -err_free_async_icosq: > - kvfree(async_icosq); > - return ERR_PTR(err); > -} > - > -static void mlx5e_close_async_icosq(struct mlx5e_icosq *async_icosq) > -{ > - mlx5e_close_icosq(async_icosq); > - mlx5e_close_cq(&async_icosq->cq); > - kvfree(async_icosq); > -} > - > static int mlx5e_open_queues(struct mlx5e_channel *c, > struct mlx5e_params *params, > - struct mlx5e_channel_param *cparam, > - bool async_icosq_needed) > + struct mlx5e_channel_param *cparam) > { > const struct net_device_ops *netdev_ops = c->netdev->netdev_ops; > struct dim_cq_moder icocq_moder = {0, 0}; > @@ -2642,10 +2598,15 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, > > mlx5e_build_create_cq_param(&ccp, c); > > + err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, &ccp, > + &c->async_icosq.cq); > + if (err) > + return err; > + > err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->icosq.cqp, &ccp, > &c->icosq.cq); > if (err) > - return err; > + goto err_close_async_icosq_cq; > > err = mlx5e_open_tx_cqs(c, 
params, &ccp, cparam); > if (err) > @@ -2669,14 +2630,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, > if (err) > goto err_close_rx_cq; > > - if (async_icosq_needed) { > - c->async_icosq = mlx5e_open_async_icosq(c, params, cparam, > - &ccp); > - if (IS_ERR(c->async_icosq)) { > - err = PTR_ERR(c->async_icosq); > - goto err_close_rq_xdpsq_cq; > - } > - } > + spin_lock_init(&c->async_icosq_lock); > + > + err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq, > + mlx5e_async_icosq_err_cqe_work); > + if (err) > + goto err_close_rq_xdpsq_cq; > > mutex_init(&c->icosq_recovery_lock); > > @@ -2712,8 +2671,7 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, > mlx5e_close_icosq(&c->icosq); > > err_close_async_icosq: > - if (c->async_icosq) > - mlx5e_close_async_icosq(c->async_icosq); > + mlx5e_close_icosq(&c->async_icosq); > > err_close_rq_xdpsq_cq: > if (c->xdp) > @@ -2732,6 +2690,9 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, > err_close_icosq_cq: > mlx5e_close_cq(&c->icosq.cq); > > +err_close_async_icosq_cq: > + mlx5e_close_cq(&c->async_icosq.cq); > + > return err; > } > > @@ -2745,8 +2706,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) > mlx5e_close_sqs(c); > mlx5e_close_icosq(&c->icosq); > mutex_destroy(&c->icosq_recovery_lock); > - if (c->async_icosq) > - mlx5e_close_async_icosq(c->async_icosq); > + mlx5e_close_icosq(&c->async_icosq); > if (c->xdp) > mlx5e_close_cq(&c->rq_xdpsq.cq); > mlx5e_close_cq(&c->rq.cq); > @@ -2754,6 +2714,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) > mlx5e_close_xdpredirect_sq(c->xdpsq); > mlx5e_close_tx_cqs(c); > mlx5e_close_cq(&c->icosq.cq); > + mlx5e_close_cq(&c->async_icosq.cq); > } > > static u8 mlx5e_enumerate_lag_port(struct mlx5_core_dev *mdev, int ix) > @@ -2789,16 +2750,9 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) > > void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) > { > - bool locked; > - > - if 
(!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state)) > - synchronize_net(); > - > - locked = mlx5e_icosq_sync_lock(&c->icosq); > - mlx5e_trigger_irq(&c->icosq); > - mlx5e_icosq_sync_unlock(&c->icosq, locked); > - > - clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state); > + spin_lock_bh(&c->async_icosq_lock); > + mlx5e_trigger_irq(&c->async_icosq); > + spin_unlock_bh(&c->async_icosq_lock); > } > > void mlx5e_trigger_napi_sched(struct napi_struct *napi) > @@ -2831,7 +2785,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, > struct mlx5e_channel_param *cparam; > struct mlx5_core_dev *mdev; > struct mlx5e_xsk_param xsk; > - bool async_icosq_needed; > struct mlx5e_channel *c; > unsigned int irq; > int vec_ix; > @@ -2881,8 +2834,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, > netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix); > netif_napi_set_irq_locked(&c->napi, irq); > > - async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled; > - err = mlx5e_open_queues(c, params, cparam, async_icosq_needed); > + err = mlx5e_open_queues(c, params, cparam); > if (unlikely(err)) > goto err_napi_del; > > @@ -2920,8 +2872,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) > for (tc = 0; tc < c->num_tc; tc++) > mlx5e_activate_txqsq(&c->sq[tc]); > mlx5e_activate_icosq(&c->icosq); > - if (c->async_icosq) > - mlx5e_activate_icosq(c->async_icosq); > + mlx5e_activate_icosq(&c->async_icosq); > > if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) > mlx5e_activate_xsk(c); > @@ -2942,8 +2893,7 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c) > else > mlx5e_deactivate_rq(&c->rq); > > - if (c->async_icosq) > - mlx5e_deactivate_icosq(c->async_icosq); > + mlx5e_deactivate_icosq(&c->async_icosq); > mlx5e_deactivate_icosq(&c->icosq); > for (tc = 0; tc < c->num_tc; tc++) > mlx5e_deactivate_txqsq(&c->sq[tc]); > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > index 1fc3720d2201..1f6930c77437 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c > @@ -778,7 +778,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > struct mlx5_wq_cyc *wq = &sq->wq; > struct mlx5e_umr_wqe *umr_wqe; > u32 offset; /* 17-bit value with MTT. */ > - bool sync_locked; > u16 pi; > int err; > int i; > @@ -789,7 +788,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > goto err; > } > > - sync_locked = mlx5e_icosq_sync_lock(sq); > pi = mlx5e_icosq_get_next_pi(sq, rq->mpwqe.umr_wqebbs); > umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); > memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); > @@ -837,14 +835,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) > }; > > sq->pc += rq->mpwqe.umr_wqebbs; > - mlx5e_icosq_sync_unlock(sq, sync_locked); > > sq->doorbell_cseg = &umr_wqe->hdr.ctrl; > > return 0; > > err_unmap: > - mlx5e_icosq_sync_unlock(sq, sync_locked); > while (--i >= 0) { > frag_page--; > mlx5e_page_release_fragmented(rq->page_pool, frag_page); > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c > index b31f689fe271..76108299ea57 100644 > --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c > +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c > @@ -125,7 +125,6 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) > { > struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, > napi); > - struct mlx5e_icosq *aicosq = c->async_icosq; > struct mlx5e_ch_stats *ch_stats = c->stats; > struct mlx5e_xdpsq *xsksq = &c->xsksq; > struct mlx5e_txqsq __rcu **qos_sqs; > @@ -181,18 +180,15 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) > busy |= work_done == budget; > > mlx5e_poll_ico_cq(&c->icosq.cq); > - if (aicosq) { > - if (mlx5e_poll_ico_cq(&aicosq->cq)) > - /* Don't clear the flag 
if nothing was polled to prevent > - * queueing more WQEs and overflowing the async ICOSQ. > - */ > - clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, > - &aicosq->state); > - > - /* Keep after async ICOSQ CQ poll */ > - if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) > - busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); > - } > + if (mlx5e_poll_ico_cq(&c->async_icosq.cq)) > + /* Don't clear the flag if nothing was polled to prevent > + * queueing more WQEs and overflowing the async ICOSQ. > + */ > + clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state); > + > + /* Keep after async ICOSQ CQ poll */ > + if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) > + busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); > > busy |= INDIRECT_CALL_2(rq->post_wqes, > mlx5e_post_rx_mpwqes, > @@ -240,17 +236,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) > > mlx5e_cq_arm(&rq->cq); > mlx5e_cq_arm(&c->icosq.cq); > - if (aicosq) { > - mlx5e_cq_arm(&aicosq->cq); > - if (xsk_open) { > - mlx5e_handle_rx_dim(xskrq); > - mlx5e_cq_arm(&xsksq->cq); > - mlx5e_cq_arm(&xskrq->cq); > - } > - } > + mlx5e_cq_arm(&c->async_icosq.cq); > if (c->xdpsq) > mlx5e_cq_arm(&c->xdpsq->cq); > > + if (xsk_open) { > + mlx5e_handle_rx_dim(xskrq); > + mlx5e_cq_arm(&xsksq->cq); > + mlx5e_cq_arm(&xskrq->cq); > + } > + > if (unlikely(aff_change && busy_xsk)) { > mlx5e_trigger_irq(&c->icosq); > ch_stats->force_irq++; ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-01-25 8:33 ` Tariq Toukan @ 2026-01-26 9:23 ` Daniel Borkmann 2026-02-01 12:50 ` Tariq Toukan 2026-01-26 16:06 ` Alice Mikityanska 1 sibling, 1 reply; 12+ messages in thread From: Daniel Borkmann @ 2026-01-26 9:23 UTC (permalink / raw) To: Tariq Toukan, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman Hi Tariq, On 1/25/26 9:33 AM, Tariq Toukan wrote: > On 24/01/2026 0:39, Daniel Borkmann wrote: >> This reverts the following commits: >> >> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >> >> There are a couple of regressions on the xsk side I ran into: >> >> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >> >> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >> >> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >> [ 103.963743] #PF: supervisor read access in kernel mode >> [ 103.963746] #PF: error_code(0x0000) - not-present page >> [ 103.963749] PGD 0 P4D 0 >> [ 103.963752] Oops: Oops: 0000 [#1] SMP >> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >> [ 103.963761] Hardware name: [...] >> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >> >> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >> and therefore access to c->async_icosq->state triggers it. (On the NIC >> there is an XDP program installed by the control plane where traffic >> gets redirected into an xsk map - there was no xsk pool set up yet. 
>> At some later time a xsk pool is set up and the related xsk socket is >> added to the xsk map of the XDP program.) > > Thanks for your report. > >> Reverting the series fixes the problems again. > > Revert is too aggressive here. A fix is preferable. > We're investigating the issue in order to fix it. > We'll update. Ok, sounds good. Certainly the kTLS fixes seem independent; from the cause of the issues I've hit, it just seemed to me that they were quite fundamental and that perhaps a different approach would be needed (or alternatively only kTLS would need fixing, and the xsk optimization left as it was originally). Anyway, I'll keep the revert locally for now, and happy to test patches. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-01-26 9:23 ` Daniel Borkmann @ 2026-02-01 12:50 ` Tariq Toukan 2026-02-02 16:13 ` Daniel Borkmann 0 siblings, 1 reply; 12+ messages in thread From: Tariq Toukan @ 2026-02-01 12:50 UTC (permalink / raw) To: Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman [-- Attachment #1: Type: text/plain, Size: 2715 bytes --] On 26/01/2026 11:23, Daniel Borkmann wrote: > Hi Tariq, > > On 1/25/26 9:33 AM, Tariq Toukan wrote: >> On 24/01/2026 0:39, Daniel Borkmann wrote: >>> This reverts the following commits: >>> >>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>> >>> There are a couple of regressions on the xsk side I ran into: >>> >>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU >>> read- >>> side critical section via mlx5e_xsk_wakeup() -> >>> mlx5e_trigger_napi_icosq() >>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>> >>> Additionally, this also hits a NULL pointer dereference in >>> mlx5e_xsk_wakeup(): >>> >>> [ 103.963735] BUG: kernel NULL pointer dereference, address: >>> 0000000000000240 >>> [ 103.963743] #PF: supervisor read access in kernel mode >>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>> [ 103.963749] PGD 0 P4D 0 >>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not >>> tainted 6.19.0-rc5+ #229 PREEMPT(none) >>> [ 103.963761] Hardware name: [...] >>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>> >>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>> and therefore access to c->async_icosq->state triggers it. 
(On the NIC >>> there is an XDP program installed by the control plane where traffic >>> gets redirected into an xsk map - there was no xsk pool set up yet. >>> At some later time a xsk pool is set up and the related xsk socket is >>> added to the xsk map of the XDP program.) >> >> Thanks for your report. >> >>> Reverting the series fixes the problems again. >> >> Revert is too aggressive here. A fix is preferable. >> We're investigating the issue in order to fix it. >> We'll update. > Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause > of the issues I've hit it just seemed to me that they were quite > fundamental > and that perhaps a different approach would be needed (or alternatively > only > kTLS would need fixing, and the xsk optimization left as it was > originally). > Anyway, I'll keep the revert locally for now, and happy to test patches. Hi Daniel, Please check attached patch. We were able to repro the issues internally and verify the fix. We're finalizing it before submission. I'd be glad if you can confirm it solves the issues for you. 
Thanks, Tariq [-- Attachment #2: 0001-net-mlx5e-XSK-Fix-unintended-ICOSQ-change.patch --] [-- Type: text/plain, Size: 4503 bytes --] From fd51958ad083528859a171a84aeba7021989872b Mon Sep 17 00:00:00 2001 From: Tariq Toukan <tariqt@nvidia.com> Date: Mon, 26 Jan 2026 17:07:29 +0200 Subject: [PATCH] net/mlx5e: XSK, Fix unintended ICOSQ change Change-Id: Iffaa183f9ed8a9190c38be2c5290b44bf6ef731a Signed-off-by: Tariq Toukan <tariqt@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en/xsk/pool.c | 4 +-- .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 28 ++++++++++++++----- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 19b9683f4622..63da9f680e87 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1109,6 +1109,7 @@ int mlx5e_open_locked(struct net_device *netdev); int mlx5e_close_locked(struct net_device *netdev); void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c); +void mlx5e_trigger_napi_async_icosq(struct mlx5e_channel *c); void mlx5e_trigger_napi_sched(struct napi_struct *napi); int mlx5e_open_channels(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c index db776e515b6a..5c5360a25c64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c @@ -127,7 +127,7 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv, goto err_remove_pool; mlx5e_activate_xsk(c); - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); /* Don't wait for WQEs, because the newer xdpsock sample doesn't provide * any Fill Ring entries at the setup stage. 
@@ -179,7 +179,7 @@ static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix) c = priv->channels.c[ix]; mlx5e_activate_rq(&c->rq); - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); mlx5e_wait_for_min_rx_wqes(&c->rq, MLX5E_RQ_WQES_TIMEOUT); mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, false); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 9e33156fac8a..8aeab4b21035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -34,7 +34,7 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) &c->async_icosq->state)) return 0; - mlx5e_trigger_napi_icosq(c); + mlx5e_trigger_napi_async_icosq(c); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e1e05c9e7ebb..4a6887ba5741 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2789,16 +2789,30 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) { + struct mlx5e_icosq *sq = &c->icosq; bool locked; - if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state)) - synchronize_net(); + WARN_ON(test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state)); - locked = mlx5e_icosq_sync_lock(&c->icosq); - mlx5e_trigger_irq(&c->icosq); - mlx5e_icosq_sync_unlock(&c->icosq, locked); + set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state); + synchronize_net(); - clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state); + locked = mlx5e_icosq_sync_lock(sq); + mlx5e_trigger_irq(sq); + mlx5e_icosq_sync_unlock(sq, locked); + + clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state); +} + +void mlx5e_trigger_napi_async_icosq(struct mlx5e_channel *c) +{ + struct mlx5e_icosq *sq = c->async_icosq; + + WARN_ON(!sq); + + 
spin_lock_bh(&sq->lock); + mlx5e_trigger_irq(sq); + spin_unlock_bh(&sq->lock); } void mlx5e_trigger_napi_sched(struct napi_struct *napi) @@ -2881,7 +2895,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix); netif_napi_set_irq_locked(&c->napi, irq); - async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled; + async_icosq_needed = !!params->xdp_prog || priv->ktls_rx_was_enabled; err = mlx5e_open_queues(c, params, cparam, async_icosq_needed); if (unlikely(err)) goto err_napi_del; base-commit: d8f87aa5fa0a4276491fa8ef436cd22605a3f9ba -- 2.44.0 ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-01 12:50 ` Tariq Toukan @ 2026-02-02 16:13 ` Daniel Borkmann 2026-02-03 9:20 ` Alice Mikityanska 2026-02-03 12:30 ` Tariq Toukan 0 siblings, 2 replies; 12+ messages in thread From: Daniel Borkmann @ 2026-02-02 16:13 UTC (permalink / raw) To: Tariq Toukan, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman, alice.kernel Hi Tariq, On 2/1/26 1:50 PM, Tariq Toukan wrote: > On 26/01/2026 11:23, Daniel Borkmann wrote: >> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>> This reverts the following commits: >>>> >>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>> >>>> There are a couple of regressions on the xsk side I ran into: >>>> >>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>>> >>>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>>> >>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>> [ 103.963749] PGD 0 P4D 0 >>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>> [ 103.963761] Hardware name: [...] >>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>> >>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>> and therefore access to c->async_icosq->state triggers it. 
(On the NIC >>>> there is an XDP program installed by the control plane where traffic >>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>> At some later time a xsk pool is set up and the related xsk socket is >>>> added to the xsk map of the XDP program.) >>> >>> Thanks for your report. >>> >>>> Reverting the series fixes the problems again. >>> >>> Revert is too aggressive here. A fix is preferable. >>> We're investigating the issue in order to fix it. >>> We'll update. >> Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause >> of the issues I've hit it just seemed to me that they were quite fundamental >> and that perhaps a different approach would be needed (or alternatively only >> kTLS would need fixing, and the xsk optimization left as it was originally). >> Anyway, I'll keep the revert locally for now, and happy to test patches. > > Please check attached patch. > We were able to repro the issues internally and verify the fix. > We're finalizing it before submission. > > I'd be glad if you can confirm it solves the issues for you. That seems to work for me, yes. Feel free to add my Tested-by. Thanks, Daniel ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-02 16:13 ` Daniel Borkmann @ 2026-02-03 9:20 ` Alice Mikityanska 2026-02-03 14:58 ` Tariq Toukan 2026-02-03 12:30 ` Tariq Toukan 1 sibling, 1 reply; 12+ messages in thread From: Alice Mikityanska @ 2026-02-03 9:20 UTC (permalink / raw) To: Daniel Borkmann, Tariq Toukan, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman On Mon, Feb 2, 2026, at 18:13, Daniel Borkmann wrote: > Hi Tariq, > > On 2/1/26 1:50 PM, Tariq Toukan wrote: >> On 26/01/2026 11:23, Daniel Borkmann wrote: >>> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>>> This reverts the following commits: >>>>> >>>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>>> >>>>> There are a couple of regressions on the xsk side I ran into: >>>>> >>>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>>>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>>>> >>>>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>>>> >>>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>>> [ 103.963749] PGD 0 P4D 0 >>>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>>> [ 103.963761] Hardware name: [...] 
>>>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>>> >>>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>>>> there is an XDP program installed by the control plane where traffic >>>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>>> At some later time a xsk pool is set up and the related xsk socket is >>>>> added to the xsk map of the XDP program.) >>>> >>>> Thanks for your report. >>>> >>>>> Reverting the series fixes the problems again. >>>> >>>> Revert is too aggressive here. A fix is preferable. >>>> We're investigating the issue in order to fix it. >>>> We'll update. >>> Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause >>> of the issues I've hit it just seemed to me that they were quite fundamental >>> and that perhaps a different approach would be needed (or alternatively only >>> kTLS would need fixing, and the xsk optimization left as it was originally). >>> Anyway, I'll keep the revert locally for now, and happy to test patches. >> >> Please check attached patch. Hi Tariq, I also took a glance at the patch: that seems to be correct in XSK parts, and mlx5e_trigger_napi_icosq now makes sense to me after your explanation. Two small comments though: 1. I'd prefer DEBUG_NET_WARN_ON_ONCE instead of WARN_ON. At least XSK wakeup is a hot path, called frequently, so we don't want to flood dmesg with the same error, and it would be better to compile out the check in non-debug builds, hence DEBUG_NET_WARN_ON_ONCE. 2. I see you changed the condition of creating async_icosq to be created when any xdp_prog is attached, even when there are no XSK pools. Is there a case where async_icosq is useful with plain XDP without XSK? If not, I believe this condition should be XDP && XSK. Thanks, Alice >> We were able to repro the issues internally and verify the fix. 
>> We're finalizing it before submission. >> >> I'd be glad if you can confirm it solves the issues for you. > That seems to work for me, yes. Feel free to add my Tested-by. > > Thanks, > Daniel ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-03 9:20 ` Alice Mikityanska @ 2026-02-03 14:58 ` Tariq Toukan 2026-02-03 17:54 ` Alice Mikityanska 0 siblings, 1 reply; 12+ messages in thread From: Tariq Toukan @ 2026-02-03 14:58 UTC (permalink / raw) To: Alice Mikityanska, Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman On 03/02/2026 11:20, Alice Mikityanska wrote: > On Mon, Feb 2, 2026, at 18:13, Daniel Borkmann wrote: >> Hi Tariq, >> >> On 2/1/26 1:50 PM, Tariq Toukan wrote: >>> On 26/01/2026 11:23, Daniel Borkmann wrote: >>>> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>>>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>>>> This reverts the following commits: >>>>>> >>>>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>>>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>>>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>>>> >>>>>> There are a couple of regressions on the xsk side I ran into: >>>>>> >>>>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>>>>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>>>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>>>>> >>>>>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>>>>> >>>>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>>>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>>>> [ 103.963749] PGD 0 P4D 0 >>>>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>>>> [ 103.963761] Hardware name: [...] 
>>>>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>>>> >>>>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>>>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>>>>> there is an XDP program installed by the control plane where traffic >>>>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>>>> At some later time a xsk pool is set up and the related xsk socket is >>>>>> added to the xsk map of the XDP program.) >>>>> >>>>> Thanks for your report. >>>>> >>>>>> Reverting the series fixes the problems again. >>>>> >>>>> Revert is too aggressive here. A fix is preferable. >>>>> We're investigating the issue in order to fix it. >>>>> We'll update. >>>> Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause >>>> of the issues I've hit it just seemed to me that they were quite fundamental >>>> and that perhaps a different approach would be needed (or alternatively only >>>> kTLS would need fixing, and the xsk optimization left as it was originally). >>>> Anyway, I'll keep the revert locally for now, and happy to test patches. >>> >>> Please check attached patch. > > Hi Tariq, > > I also took a glance at the patch: that seems to be correct in XSK parts, and mlx5e_trigger_napi_icosq now makes sense to me after your explanation. Great. > Two small comments though: > > 1. I'd prefer DEBUG_NET_WARN_ON_ONCE instead of WARN_ON. At least XSK wakeup is a hot path, called frequently, so we don't want to flood dmesg with the same error, and it would be better to compile out the check in non-debug builds, hence DEBUG_NET_WARN_ON_ONCE. > I tend to totally drop this check. > 2. I see you changed the condition of creating async_icosq to be created when any xdp_prog is attached, even when there are no XSK pools. Is there a case where async_icosq is useful with plain XDP without XSK? If not, I believe this condition should be XDP && XSK. > You're right. 
As you know, the channels are not re-opened on XSK pool events. They are re-opened on XDP program attach/detach. Hence, as a simple and quick fix, we will check for XDP program and create the async ICOSQ accordingly. Currently we won't create/destroy the async ICOSQ when an XSK pool is activated/deactivated. I will mention this in the commit message. This still prevents the async ICOSQ creation by default. > Thanks, > Alice > >>> We were able to repro the issues internally and verify the fix. >>> We're finalizing it before submission. >>> >>> I'd be glad if you can confirm it solves the issues for you. >> That seems to work for me, yes. Feel free to add my Tested-by. >> >> Thanks, >> Daniel ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-03 14:58 ` Tariq Toukan @ 2026-02-03 17:54 ` Alice Mikityanska 2026-02-04 6:20 ` Tariq Toukan 0 siblings, 1 reply; 12+ messages in thread From: Alice Mikityanska @ 2026-02-03 17:54 UTC (permalink / raw) To: Tariq Toukan, Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman On Tue, Feb 3, 2026, at 16:58, Tariq Toukan wrote: > On 03/02/2026 11:20, Alice Mikityanska wrote: >> On Mon, Feb 2, 2026, at 18:13, Daniel Borkmann wrote: >>> Hi Tariq, >>> >>> On 2/1/26 1:50 PM, Tariq Toukan wrote: >>>> On 26/01/2026 11:23, Daniel Borkmann wrote: >>>>> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>>>>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>>>>> This reverts the following commits: >>>>>>> >>>>>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>>>>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>>>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>>>>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>>>>> >>>>>>> There are a couple of regressions on the xsk side I ran into: >>>>>>> >>>>>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>>>>>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>>>>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>>>>>> >>>>>>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>>>>>> >>>>>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>>>>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>>>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>>>>> [ 103.963749] PGD 0 P4D 0 >>>>>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>>>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>>>>> [ 103.963761] Hardware name: [...] 
>>>>>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>>>>> >>>>>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>>>>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>>>>>> there is an XDP program installed by the control plane where traffic >>>>>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>>>>> At some later time a xsk pool is set up and the related xsk socket is >>>>>>> added to the xsk map of the XDP program.) >>>>>> >>>>>> Thanks for your report. >>>>>> >>>>>>> Reverting the series fixes the problems again. >>>>>> >>>>>> Revert is too aggressive here. A fix is preferable. >>>>>> We're investigating the issue in order to fix it. >>>>>> We'll update. >>>>> Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause >>>>> of the issues I've hit it just seemed to me that they were quite fundamental >>>>> and that perhaps a different approach would be needed (or alternatively only >>>>> kTLS would need fixing, and the xsk optimization left as it was originally). >>>>> Anyway, I'll keep the revert locally for now, and happy to test patches. >>>> >>>> Please check attached patch. >> >> Hi Tariq, >> >> I also took a glance at the patch: that seems to be correct in XSK parts, and mlx5e_trigger_napi_icosq now makes sense to me after your explanation. > > Great. > >> Two small comments though: >> >> 1. I'd prefer DEBUG_NET_WARN_ON_ONCE instead of WARN_ON. At least XSK wakeup is a hot path, called frequently, so we don't want to flood dmesg with the same error, and it would be better to compile out the check in non-debug builds, hence DEBUG_NET_WARN_ON_ONCE. >> > > I tend to totally drop this check. > >> 2. I see you changed the condition of creating async_icosq to be created when any xdp_prog is attached, even when there are no XSK pools. Is there a case where async_icosq is useful with plain XDP without XSK? 
If not, I believe this condition should be XDP && XSK. >> > > You're right. > As you know, the channels are not re-opened on XSK pool events. > They are re-opened on XDP program attach/detach. Ah, that's right, I see. I didn't pay attention to where the check is located. If you reorganize the code a little bit, it's possible to implement the perfect check (XDP && XSK) || kTLS_RX, to avoid creating the unneeded async_icosq with plain XDP. (BTW, I see that ktls_rx_was_enabled is never reset to false: is it because we can't reliably fence the moment when the driver stops using it?) Anyway, it's up to you if you'd like to do it or not, but here is my suggestion. We don't have to create async_icosq in mlx5e_open_queues. Similar to how we separated mlx5e_open_xsk, we can separate mlx5e_open_async_icosq, which is where we can check (XDP && XSK) || kTLS_RX. Then mlx5e_open_channel will look like: mlx5e_open_queues(); mlx5e_open_async_icosq(); // Checks the condition internally if (xsk_pool) mlx5e_open_xsk(); mlx5e_xsk_enable_locked will also call mlx5e_open_async_icosq just before mlx5e_open_xsk. mlx5e_close_async_icosq will close it if it exists. With this design, all cases are covered: every mlx5e_open_xsk is paired with mlx5e_open_async_icosq; and kTLS RX enable flow reopens channels, which also triggers mlx5e_open_async_icosq. The conditional is DRYed in mlx5e_open_async_icosq itself, so no code repetition either. > Hence, as a simple and quick fix, we will check for XDP program and > create the async ICOSQ accordingly. > Currently we won't have create/destroy async ICOSQ when XSK pool is > activated/deactivated. I will mention this in the commit message. > > This still prevents the async ICOSQ creation in default. > >> Thanks, >> Alice >> > > >>>> We were able to repro the issues internally and verify the fix. >>>> We're finalizing it before submission. >>>> >>>> I'd be glad if you can confirm it solves the issues for you. >>> That seems to work for me, yes. 
Feel free to add my Tested-by. >>> >>> Thanks, >>> Daniel ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-03 17:54 ` Alice Mikityanska @ 2026-02-04 6:20 ` Tariq Toukan 0 siblings, 0 replies; 12+ messages in thread From: Tariq Toukan @ 2026-02-04 6:20 UTC (permalink / raw) To: Alice Mikityanska, Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman On 03/02/2026 19:54, Alice Mikityanska wrote: > On Tue, Feb 3, 2026, at 16:58, Tariq Toukan wrote: >> On 03/02/2026 11:20, Alice Mikityanska wrote: >>> On Mon, Feb 2, 2026, at 18:13, Daniel Borkmann wrote: >>>> Hi Tariq, >>>> >>>> On 2/1/26 1:50 PM, Tariq Toukan wrote: >>>>> On 26/01/2026 11:23, Daniel Borkmann wrote: >>>>>> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>>>>>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>>>>>> This reverts the following commits: >>>>>>>> >>>>>>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>>>>>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>>>>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>>>>>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>>>>>> >>>>>>>> There are a couple of regressions on the xsk side I ran into: >>>>>>>> >>>>>>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>>>>>>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>>>>>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). 
>>>>>>>> >>>>>>>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>>>>>>> >>>>>>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>>>>>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>>>>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>>>>>> [ 103.963749] PGD 0 P4D 0 >>>>>>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>>>>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>>>>>> [ 103.963761] Hardware name: [...] >>>>>>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>>>>>> >>>>>>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>>>>>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>>>>>>> there is an XDP program installed by the control plane where traffic >>>>>>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>>>>>> At some later time a xsk pool is set up and the related xsk socket is >>>>>>>> added to the xsk map of the XDP program.) >>>>>>> >>>>>>> Thanks for your report. >>>>>>> >>>>>>>> Reverting the series fixes the problems again. >>>>>>> >>>>>>> Revert is too aggressive here. A fix is preferable. >>>>>>> We're investigating the issue in order to fix it. >>>>>>> We'll update. >>>>>> Ok, sounds good. Certainly the kTLS fixes seem independent, from the cause >>>>>> of the issues I've hit it just seemed to me that they were quite fundamental >>>>>> and that perhaps a different approach would be needed (or alternatively only >>>>>> kTLS would need fixing, and the xsk optimization left as it was originally). >>>>>> Anyway, I'll keep the revert locally for now, and happy to test patches. >>>>> >>>>> Please check attached patch. >>> >>> Hi Tariq, >>> >>> I also took a glance at the patch: that seems to be correct in XSK parts, and mlx5e_trigger_napi_icosq now makes sense to me after your explanation. >> >> Great. 
>> >>> Two small comments though: >>> >>> 1. I'd prefer DEBUG_NET_WARN_ON_ONCE instead of WARN_ON. At least XSK wakeup is a hot path, called frequently, so we don't want to flood dmesg with the same error, and it would be better to compile out the check in non-debug builds, hence DEBUG_NET_WARN_ON_ONCE. >>> >> >> I tend to totally drop this check. >> >>> 2. I see you changed the condition of creating async_icosq to be created when any xdp_prog is attached, even when there are no XSK pools. Is there a case where async_icosq is useful with plain XDP without XSK? If not, I believe this condition should be XDP && XSK. >>> >> >> You're right. >> As you know, the channels are not re-opened on XSK pool events. >> They are re-opened on XDP program attach/detach. > > Ah, that's right, I see. I didn't pay attention to where the check is > located. If you reorganize the code a little bit, it's possible to > implement the perfect check (XDP && XSK) || kTLS_RX, to avoid creating > the unneeded async_icosq with plain XDP. > Right, but I won't do it here as a fix. > (BTW, I see that ktls_rx_was_enabled is never reset to false: is it > because we can't reliably fence the moment when the driver stops using > it?) > Offload feature can be disabled but offloaded connections can stay alive until they're closed. It is possible to track them and reset ktls_rx_was_enabled to false, but it's currently not implemented. > Anyway, it's up to you if you'd like to do it or not, but here is my > suggestion. We don't have to create async_icosq in mlx5e_open_queues. > Similar to how we separated mlx5e_open_xsk, we can separate > mlx5e_open_async_icosq, which is where we can check (XDP && XSK) || > kTLS_RX. > > Then mlx5e_open_channel will look like: > > mlx5e_open_queues(); > mlx5e_open_async_icosq(); // Checks the condition internally > if (xsk_pool) > mlx5e_open_xsk(); > > mlx5e_xsk_enable_locked will also call mlx5e_open_async_icosq just > before mlx5e_open_xsk. 
> > mlx5e_close_async_icosq will close it if it exists. > > With this design, all cases are covered: every mlx5e_open_xsk is paired > with mlx5e_open_async_icosq; and kTLS RX enable flow reopens channels, > which also triggers mlx5e_open_async_icosq. The conditional is DRYed in > mlx5e_open_async_icosq itself, so no code repetition either. > Right, it would work, and save the async icosq creation for the case (XDP && !XSK). I prefer to implement this change as a followup, and keep this fix simple. >> Hence, as a simple and quick fix, we will check for XDP program and >> create the async ICOSQ accordingly. >> Currently we won't have create/destroy async ICOSQ when XSK pool is >> activated/deactivated. I will mention this in the commit message. >> >> This still prevents the async ICOSQ creation in default. >> >>> Thanks, >>> Alice >>> >> >> >>>>> We were able to repro the issues internally and verify the fix. >>>>> We're finalizing it before submission. >>>>> >>>>> I'd be glad if you can confirm it solves the issues for you. >>>> That seems to work for me, yes. Feel free to add my Tested-by. >>>> >>>> Thanks, >>>> Daniel ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-02-02 16:13 ` Daniel Borkmann 2026-02-03 9:20 ` Alice Mikityanska @ 2026-02-03 12:30 ` Tariq Toukan 1 sibling, 0 replies; 12+ messages in thread From: Tariq Toukan @ 2026-02-03 12:30 UTC (permalink / raw) To: Daniel Borkmann, netdev Cc: William Tu, Tariq Toukan, David Wei, Jakub Kicinski, Gal Pressman, alice.kernel On 02/02/2026 18:13, Daniel Borkmann wrote: > Hi Tariq, > > On 2/1/26 1:50 PM, Tariq Toukan wrote: >> On 26/01/2026 11:23, Daniel Borkmann wrote: >>> On 1/25/26 9:33 AM, Tariq Toukan wrote: >>>> On 24/01/2026 0:39, Daniel Borkmann wrote: >>>>> This reverts the following commits: >>>>> >>>>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ >>>>> struct") >>>>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") >>>>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic >>>>> allocation") >>>>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>>>> >>>>> There are a couple of regressions on the xsk side I ran into: >>>>> >>>>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU >>>>> read- >>>>> side critical section via mlx5e_xsk_wakeup() -> >>>>> mlx5e_trigger_napi_icosq() >>>>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >>>>> >>>>> Additionally, this also hits a NULL pointer dereference in >>>>> mlx5e_xsk_wakeup(): >>>>> >>>>> [ 103.963735] BUG: kernel NULL pointer dereference, address: >>>>> 0000000000000240 >>>>> [ 103.963743] #PF: supervisor read access in kernel mode >>>>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>>>> [ 103.963749] PGD 0 P4D 0 >>>>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>>>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not >>>>> tainted 6.19.0-rc5+ #229 PREEMPT(none) >>>>> [ 103.963761] Hardware name: [...] 
>>>>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>>>> >>>>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>>>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>>>> there is an XDP program installed by the control plane where traffic >>>>> gets redirected into an xsk map - there was no xsk pool set up yet. >>>>> At some later time a xsk pool is set up and the related xsk socket is >>>>> added to the xsk map of the XDP program.) >>>> >>>> Thanks for your report. >>>> >>>>> Reverting the series fixes the problems again. >>>> >>>> Revert is too aggressive here. A fix is preferable. >>>> We're investigating the issue in order to fix it. >>>> We'll update. >>> Ok, sounds good. Certainly the kTLS fixes seem independent, from the >>> cause >>> of the issues I've hit it just seemed to me that they were quite >>> fundamental >>> and that perhaps a different approach would be needed (or >>> alternatively only >>> kTLS would need fixing, and the xsk optimization left as it was >>> originally). >>> Anyway, I'll keep the revert locally for now, and happy to test patches. >> >> Please check attached patch. >> We were able to repro the issues internally and verify the fix. >> We're finalizing it before submission. >> >> I'd be glad if you can confirm it solves the issues for you. > That seems to work for me, yes. Feel free to add my Tested-by. > > Thanks, > Daniel Great! Thanks for testing it. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-01-25 8:33 ` Tariq Toukan 2026-01-26 9:23 ` Daniel Borkmann @ 2026-01-26 16:06 ` Alice Mikityanska 2026-01-26 20:54 ` Tariq Toukan 1 sibling, 1 reply; 12+ messages in thread From: Alice Mikityanska @ 2026-01-26 16:06 UTC (permalink / raw) To: Tariq Toukan, William Tu, Tariq Toukan Cc: David Wei, Jakub Kicinski, Gal Pressman, Daniel Borkmann, netdev On Sun, Jan 25, 2026, at 10:33, Tariq Toukan wrote: > On 24/01/2026 0:39, Daniel Borkmann wrote: >> This reverts the following commits: >> >> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") Hi Tariq and William, A few comments on my side: I think the second patch is wrong in how it repurposes the regular ICOSQ for XSK wakeups. mlx5e_xsk_wakeup() can be called on any CPU that doesn't necessarily match the channel's CPU. That's why the spinlock around async_icosq was needed. By replacing icosq with async_icosq in mlx5e_trigger_napi_icosq(), this patch also affects what's done in mlx5e_xsk_wakeup(). The consequences are: 1. The checks in mlx5e_xsk_wakeup() are made on async_icosq, but the regular icosq is then used. 2. As far as I understood the commit description, it's intended that mlx5e_trigger_napi_icosq() should use icosq unlocked most of the time, but the critical section is actually needed when it's called from mlx5e_xsk_wakeup(), because it can run on any CPU. Looking at the code, though, I feel there is also some typo that defeats the purpose of this patch. Every mlx5e_trigger_napi_icosq() call will test_and_set_bit(), and the set bit means do the lock. As the bit is always being set right before calling mlx5e_icosq_sync_lock(), it will lock on icosq at all times (well, unless a racy other thread calls clear_bit() in between of test_and_set_bit() and mlx5e_icosq_sync_lock()). 
Moreover, the way it's implemented now, synchronize_net() will also be called on each post of WQEs, causing a severe slowdown in datapath. One more thing that needs to be taken into account if you find a way to post XSK wakeups to the regular ICOSQ: you should reserve +1 WQE (for XSK NOP) when allocating the ICOSQ, and also use the MLX5E_SQ_STATE_PENDING_XSK_TX flag correctly (to avoid unnecessarily posting many NOPs and risking overflowing the queue). >> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >> >> There are a couple of regressions on the xsk side I ran into: >> >> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). >> >> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >> >> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >> [ 103.963743] #PF: supervisor read access in kernel mode >> [ 103.963746] #PF: error_code(0x0000) - not-present page >> [ 103.963749] PGD 0 P4D 0 >> [ 103.963752] Oops: Oops: 0000 [#1] SMP >> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >> [ 103.963761] Hardware name: [...] >> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >> >> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >> and therefore access to c->async_icosq->state triggers it. (On the NIC >> there is an XDP program installed by the control plane where traffic >> gets redirected into an xsk map - there was no xsk pool set up yet. >> At some later time a xsk pool is set up and the related xsk socket is >> added to the xsk map of the XDP program.) >> > > Hi Daniel, > > Thanks for your report. > >> Reverting the series fixes the problems again. 
>> > > Revert is too aggressive here. A fix is preferable. > We're investigating the issue in order to fix it. > We'll update. > >> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> >> Cc: William Tu <witu@nvidia.com> >> Cc: Tariq Toukan <tariqt@nvidia.com> >> Cc: David Wei <dw@davidwei.uk> >> Cc: Jakub Kicinski <kuba@kernel.org> >> --- >> drivers/net/ethernet/mellanox/mlx5/core/en.h | 26 +---- >> .../mellanox/mlx5/core/en/reporter_tx.c | 1 - >> .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 3 - >> .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 6 +- >> .../mellanox/mlx5/core/en_accel/ktls.c | 10 +- >> .../mellanox/mlx5/core/en_accel/ktls_rx.c | 26 ++--- >> .../mellanox/mlx5/core/en_accel/ktls_txrx.h | 3 +- >> .../net/ethernet/mellanox/mlx5/core/en_main.c | 100 +++++------------- >> .../net/ethernet/mellanox/mlx5/core/en_rx.c | 4 - >> .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 37 +++---- >> 10 files changed, 62 insertions(+), 154 deletions(-) >> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h >> index 19b9683f4622..ff4ab4691baf 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h >> @@ -388,7 +388,6 @@ enum { >> MLX5E_SQ_STATE_DIM, >> MLX5E_SQ_STATE_PENDING_XSK_TX, >> MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, >> - MLX5E_SQ_STATE_LOCK_NEEDED, >> MLX5E_NUM_SQ_STATES, /* Must be kept last */ >> }; >> >> @@ -546,11 +545,6 @@ struct mlx5e_icosq { >> u32 sqn; >> u16 reserved_room; >> unsigned long state; >> - /* icosq can be accessed from any CPU and from different contexts >> - * (NAPI softirq or process/workqueue). Always use spin_lock_bh for >> - * simplicity and correctness across all contexts. 
>> - */ >> - spinlock_t lock; >> struct mlx5e_ktls_resync_resp *ktls_resync; >> >> /* control path */ >> @@ -782,7 +776,9 @@ struct mlx5e_channel { >> struct mlx5e_xdpsq xsksq; >> >> /* Async ICOSQ */ >> - struct mlx5e_icosq *async_icosq; >> + struct mlx5e_icosq async_icosq; >> + /* async_icosq can be accessed from any CPU - the spinlock protects it. */ >> + spinlock_t async_icosq_lock; >> >> /* data path - accessed per napi poll */ >> const struct cpumask *aff_mask; >> @@ -805,21 +801,6 @@ struct mlx5e_channel { >> struct dim_cq_moder tx_cq_moder; >> }; >> >> -static inline bool mlx5e_icosq_sync_lock(struct mlx5e_icosq *sq) >> -{ >> - if (likely(!test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state))) >> - return false; >> - >> - spin_lock_bh(&sq->lock); >> - return true; >> -} >> - >> -static inline void mlx5e_icosq_sync_unlock(struct mlx5e_icosq *sq, bool locked) >> -{ >> - if (unlikely(locked)) >> - spin_unlock_bh(&sq->lock); >> -} >> - >> struct mlx5e_ptp; >> >> struct mlx5e_channels { >> @@ -939,7 +920,6 @@ struct mlx5e_priv { >> u8 max_opened_tc; >> bool tx_ptp_opened; >> bool rx_ptp_opened; >> - bool ktls_rx_was_enabled; >> struct kernel_hwtstamp_config hwtstamp_config; >> u16 q_counter[MLX5_SD_MAX_GROUP_SZ]; >> u16 drop_rq_q_counter; >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >> index 4adc1adf9897..9e2cf191ed30 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >> @@ -15,7 +15,6 @@ static const char * const sq_sw_state_type_name[] = { >> [MLX5E_SQ_STATE_DIM] = "dim", >> [MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx", >> [MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync", >> - [MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed", >> }; >> >> static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >> index 4f984f6a2cb9..2b05536d564a 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >> @@ -23,7 +23,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> struct mlx5_wq_cyc *wq = &icosq->wq; >> struct mlx5e_umr_wqe *umr_wqe; >> struct xdp_buff **xsk_buffs; >> - bool sync_locked; >> int batch, i; >> u32 offset; /* 17-bit value with MTT. */ >> u16 pi; >> @@ -48,7 +47,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> goto err_reuse_batch; >> } >> >> - sync_locked = mlx5e_icosq_sync_lock(icosq); >> pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs); >> umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); >> memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); >> @@ -145,7 +143,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> }; >> >> icosq->pc += rq->mpwqe.umr_wqebbs; >> - mlx5e_icosq_sync_unlock(icosq, sync_locked); >> >> icosq->doorbell_cseg = &umr_wqe->hdr.ctrl; >> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >> index 9e33156fac8a..a59199ed590d 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >> @@ -26,12 +26,10 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) >> * active and not polled by NAPI. Return 0, because the upcoming >> * activate will trigger the IRQ for us. 
>> */ >> - if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, >> - &c->async_icosq->state))) >> + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state))) >> return 0; >> >> - if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, >> - &c->async_icosq->state)) >> + if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) >> return 0; >> >> mlx5e_trigger_napi_icosq(c); >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >> index 1c2cc2aad2b0..e3e57c849436 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >> @@ -135,15 +135,10 @@ int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) >> int err = 0; >> >> mutex_lock(&priv->state_lock); >> - if (enable) { >> + if (enable) >> err = mlx5e_accel_fs_tcp_create(priv->fs); >> - if (!err && !priv->ktls_rx_was_enabled) { >> - priv->ktls_rx_was_enabled = true; >> - mlx5e_safe_reopen_channels(priv); >> - } >> - } else { >> + else >> mlx5e_accel_fs_tcp_destroy(priv->fs); >> - } >> mutex_unlock(&priv->state_lock); >> >> return err; >> @@ -166,7 +161,6 @@ int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) >> destroy_workqueue(priv->tls->rx_wq); >> return err; >> } >> - priv->ktls_rx_was_enabled = true; >> } >> >> return 0; >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >> index 5d8fe252799e..da2d1eb52c13 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >> @@ -202,8 +202,8 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, >> int err; >> >> err = 0; >> - sq = c->async_icosq; >> - spin_lock_bh(&sq->lock); >> + sq = &c->async_icosq; >> + spin_lock_bh(&c->async_icosq_lock); >> >> cseg = post_static_params(sq, priv_rx); >> if (IS_ERR(cseg)) >> @@ 
-214,7 +214,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, >> >> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); >> unlock: >> - spin_unlock_bh(&sq->lock); >> + spin_unlock_bh(&c->async_icosq_lock); >> >> return err; >> >> @@ -277,10 +277,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, >> >> buf->priv_rx = priv_rx; >> >> - spin_lock_bh(&sq->lock); >> + spin_lock_bh(&sq->channel->async_icosq_lock); >> >> if (unlikely(!mlx5e_icosq_can_post_wqe(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS))) { >> - spin_unlock_bh(&sq->lock); >> + spin_unlock_bh(&sq->channel->async_icosq_lock); >> err = -ENOSPC; >> goto err_dma_unmap; >> } >> @@ -311,7 +311,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, >> icosq_fill_wi(sq, pi, &wi); >> sq->pc++; >> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); >> - spin_unlock_bh(&sq->lock); >> + spin_unlock_bh(&sq->channel->async_icosq_lock); >> >> return 0; >> >> @@ -344,7 +344,7 @@ static void resync_handle_work(struct work_struct *work) >> } >> >> c = resync->priv->channels.c[priv_rx->rxq]; >> - sq = c->async_icosq; >> + sq = &c->async_icosq; >> >> if (resync_post_get_progress_params(sq, priv_rx)) { >> priv_rx->rq_stats->tls_resync_req_skip++; >> @@ -371,7 +371,7 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r >> struct mlx5e_icosq *sq; >> bool trigger_poll; >> >> - sq = c->async_icosq; >> + sq = &c->async_icosq; >> ktls_resync = sq->ktls_resync; >> trigger_poll = false; >> >> @@ -413,9 +413,9 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r >> return; >> >> if (!napi_if_scheduled_mark_missed(&c->napi)) { >> - spin_lock_bh(&sq->lock); >> + spin_lock_bh(&c->async_icosq_lock); >> mlx5e_trigger_irq(sq); >> - spin_unlock_bh(&sq->lock); >> + spin_unlock_bh(&c->async_icosq_lock); >> } >> } >> >> @@ -753,7 +753,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >> LIST_HEAD(local_list); >> int i, j; >> >> - sq = 
c->async_icosq; >> + sq = &c->async_icosq; >> >> if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) >> return false; >> @@ -772,7 +772,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >> clear_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); >> spin_unlock(&ktls_resync->lock); >> >> - spin_lock(&sq->lock); >> + spin_lock(&c->async_icosq_lock); >> for (j = 0; j < i; j++) { >> struct mlx5_wqe_ctrl_seg *cseg; >> >> @@ -791,7 +791,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >> } >> if (db_cseg) >> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, db_cseg); >> - spin_unlock(&sq->lock); >> + spin_unlock(&c->async_icosq_lock); >> >> priv_rx->rq_stats->tls_resync_res_ok += j; >> >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >> index 4022c7e78a2e..cb08799769ee 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >> @@ -50,8 +50,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); >> static inline bool >> mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) >> { >> - return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, >> - &c->async_icosq->state); >> + return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); >> } >> >> static inline void >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >> index e1e05c9e7ebb..446510153e5e 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >> @@ -2075,8 +2075,6 @@ static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params >> if (err) >> goto err_free_icosq; >> >> - spin_lock_init(&sq->lock); >> - >> if (param->is_tls) { >> 
sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list(); >> if (IS_ERR(sq->ktls_resync)) { >> @@ -2589,51 +2587,9 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param >> return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), q_counter, &c->rq); >> } >> >> -static struct mlx5e_icosq * >> -mlx5e_open_async_icosq(struct mlx5e_channel *c, >> - struct mlx5e_params *params, >> - struct mlx5e_channel_param *cparam, >> - struct mlx5e_create_cq_param *ccp) >> -{ >> - struct dim_cq_moder icocq_moder = {0, 0}; >> - struct mlx5e_icosq *async_icosq; >> - int err; >> - >> - async_icosq = kvzalloc_node(sizeof(*async_icosq), GFP_KERNEL, >> - cpu_to_node(c->cpu)); >> - if (!async_icosq) >> - return ERR_PTR(-ENOMEM); >> - >> - err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, ccp, >> - &async_icosq->cq); >> - if (err) >> - goto err_free_async_icosq; >> - >> - err = mlx5e_open_icosq(c, params, &cparam->async_icosq, async_icosq, >> - mlx5e_async_icosq_err_cqe_work); >> - if (err) >> - goto err_close_async_icosq_cq; >> - >> - return async_icosq; >> - >> -err_close_async_icosq_cq: >> - mlx5e_close_cq(&async_icosq->cq); >> -err_free_async_icosq: >> - kvfree(async_icosq); >> - return ERR_PTR(err); >> -} >> - >> -static void mlx5e_close_async_icosq(struct mlx5e_icosq *async_icosq) >> -{ >> - mlx5e_close_icosq(async_icosq); >> - mlx5e_close_cq(&async_icosq->cq); >> - kvfree(async_icosq); >> -} >> - >> static int mlx5e_open_queues(struct mlx5e_channel *c, >> struct mlx5e_params *params, >> - struct mlx5e_channel_param *cparam, >> - bool async_icosq_needed) >> + struct mlx5e_channel_param *cparam) >> { >> const struct net_device_ops *netdev_ops = c->netdev->netdev_ops; >> struct dim_cq_moder icocq_moder = {0, 0}; >> @@ -2642,10 +2598,15 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >> >> mlx5e_build_create_cq_param(&ccp, c); >> >> + err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, &ccp, >> + 
&c->async_icosq.cq); >> + if (err) >> + return err; >> + >> err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->icosq.cqp, &ccp, >> &c->icosq.cq); >> if (err) >> - return err; >> + goto err_close_async_icosq_cq; >> >> err = mlx5e_open_tx_cqs(c, params, &ccp, cparam); >> if (err) >> @@ -2669,14 +2630,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >> if (err) >> goto err_close_rx_cq; >> >> - if (async_icosq_needed) { >> - c->async_icosq = mlx5e_open_async_icosq(c, params, cparam, >> - &ccp); >> - if (IS_ERR(c->async_icosq)) { >> - err = PTR_ERR(c->async_icosq); >> - goto err_close_rq_xdpsq_cq; >> - } >> - } >> + spin_lock_init(&c->async_icosq_lock); >> + >> + err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq, >> + mlx5e_async_icosq_err_cqe_work); >> + if (err) >> + goto err_close_rq_xdpsq_cq; >> >> mutex_init(&c->icosq_recovery_lock); >> >> @@ -2712,8 +2671,7 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >> mlx5e_close_icosq(&c->icosq); >> >> err_close_async_icosq: >> - if (c->async_icosq) >> - mlx5e_close_async_icosq(c->async_icosq); >> + mlx5e_close_icosq(&c->async_icosq); >> >> err_close_rq_xdpsq_cq: >> if (c->xdp) >> @@ -2732,6 +2690,9 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >> err_close_icosq_cq: >> mlx5e_close_cq(&c->icosq.cq); >> >> +err_close_async_icosq_cq: >> + mlx5e_close_cq(&c->async_icosq.cq); >> + >> return err; >> } >> >> @@ -2745,8 +2706,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) >> mlx5e_close_sqs(c); >> mlx5e_close_icosq(&c->icosq); >> mutex_destroy(&c->icosq_recovery_lock); >> - if (c->async_icosq) >> - mlx5e_close_async_icosq(c->async_icosq); >> + mlx5e_close_icosq(&c->async_icosq); >> if (c->xdp) >> mlx5e_close_cq(&c->rq_xdpsq.cq); >> mlx5e_close_cq(&c->rq.cq); >> @@ -2754,6 +2714,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) >> mlx5e_close_xdpredirect_sq(c->xdpsq); >> mlx5e_close_tx_cqs(c); >> mlx5e_close_cq(&c->icosq.cq); >> + 
mlx5e_close_cq(&c->async_icosq.cq); >> } >> >> static u8 mlx5e_enumerate_lag_port(struct mlx5_core_dev *mdev, int ix) >> @@ -2789,16 +2750,9 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) >> >> void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) >> { >> - bool locked; >> - >> - if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state)) >> - synchronize_net(); >> - >> - locked = mlx5e_icosq_sync_lock(&c->icosq); >> - mlx5e_trigger_irq(&c->icosq); >> - mlx5e_icosq_sync_unlock(&c->icosq, locked); >> - >> - clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state); >> + spin_lock_bh(&c->async_icosq_lock); >> + mlx5e_trigger_irq(&c->async_icosq); >> + spin_unlock_bh(&c->async_icosq_lock); >> } >> >> void mlx5e_trigger_napi_sched(struct napi_struct *napi) >> @@ -2831,7 +2785,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, >> struct mlx5e_channel_param *cparam; >> struct mlx5_core_dev *mdev; >> struct mlx5e_xsk_param xsk; >> - bool async_icosq_needed; >> struct mlx5e_channel *c; >> unsigned int irq; >> int vec_ix; >> @@ -2881,8 +2834,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, >> netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix); >> netif_napi_set_irq_locked(&c->napi, irq); >> >> - async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled; >> - err = mlx5e_open_queues(c, params, cparam, async_icosq_needed); >> + err = mlx5e_open_queues(c, params, cparam); >> if (unlikely(err)) >> goto err_napi_del; >> >> @@ -2920,8 +2872,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) >> for (tc = 0; tc < c->num_tc; tc++) >> mlx5e_activate_txqsq(&c->sq[tc]); >> mlx5e_activate_icosq(&c->icosq); >> - if (c->async_icosq) >> - mlx5e_activate_icosq(c->async_icosq); >> + mlx5e_activate_icosq(&c->async_icosq); >> >> if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) >> mlx5e_activate_xsk(c); >> @@ -2942,8 +2893,7 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel 
*c) >> else >> mlx5e_deactivate_rq(&c->rq); >> >> - if (c->async_icosq) >> - mlx5e_deactivate_icosq(c->async_icosq); >> + mlx5e_deactivate_icosq(&c->async_icosq); >> mlx5e_deactivate_icosq(&c->icosq); >> for (tc = 0; tc < c->num_tc; tc++) >> mlx5e_deactivate_txqsq(&c->sq[tc]); >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >> index 1fc3720d2201..1f6930c77437 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >> @@ -778,7 +778,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> struct mlx5_wq_cyc *wq = &sq->wq; >> struct mlx5e_umr_wqe *umr_wqe; >> u32 offset; /* 17-bit value with MTT. */ >> - bool sync_locked; >> u16 pi; >> int err; >> int i; >> @@ -789,7 +788,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> goto err; >> } >> >> - sync_locked = mlx5e_icosq_sync_lock(sq); >> pi = mlx5e_icosq_get_next_pi(sq, rq->mpwqe.umr_wqebbs); >> umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); >> memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); >> @@ -837,14 +835,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >> }; >> >> sq->pc += rq->mpwqe.umr_wqebbs; >> - mlx5e_icosq_sync_unlock(sq, sync_locked); >> >> sq->doorbell_cseg = &umr_wqe->hdr.ctrl; >> >> return 0; >> >> err_unmap: >> - mlx5e_icosq_sync_unlock(sq, sync_locked); >> while (--i >= 0) { >> frag_page--; >> mlx5e_page_release_fragmented(rq->page_pool, frag_page); >> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >> index b31f689fe271..76108299ea57 100644 >> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >> @@ -125,7 +125,6 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >> { >> struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, >> napi); >> - struct mlx5e_icosq 
*aicosq = c->async_icosq; >> struct mlx5e_ch_stats *ch_stats = c->stats; >> struct mlx5e_xdpsq *xsksq = &c->xsksq; >> struct mlx5e_txqsq __rcu **qos_sqs; >> @@ -181,18 +180,15 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >> busy |= work_done == budget; >> >> mlx5e_poll_ico_cq(&c->icosq.cq); >> - if (aicosq) { >> - if (mlx5e_poll_ico_cq(&aicosq->cq)) >> - /* Don't clear the flag if nothing was polled to prevent >> - * queueing more WQEs and overflowing the async ICOSQ. >> - */ >> - clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, >> - &aicosq->state); >> - >> - /* Keep after async ICOSQ CQ poll */ >> - if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) >> - busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); >> - } >> + if (mlx5e_poll_ico_cq(&c->async_icosq.cq)) >> + /* Don't clear the flag if nothing was polled to prevent >> + * queueing more WQEs and overflowing the async ICOSQ. >> + */ >> + clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state); >> + >> + /* Keep after async ICOSQ CQ poll */ >> + if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) >> + busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); >> >> busy |= INDIRECT_CALL_2(rq->post_wqes, >> mlx5e_post_rx_mpwqes, >> @@ -240,17 +236,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >> >> mlx5e_cq_arm(&rq->cq); >> mlx5e_cq_arm(&c->icosq.cq); >> - if (aicosq) { >> - mlx5e_cq_arm(&aicosq->cq); >> - if (xsk_open) { >> - mlx5e_handle_rx_dim(xskrq); >> - mlx5e_cq_arm(&xsksq->cq); >> - mlx5e_cq_arm(&xskrq->cq); >> - } >> - } >> + mlx5e_cq_arm(&c->async_icosq.cq); >> if (c->xdpsq) >> mlx5e_cq_arm(&c->xdpsq->cq); >> >> + if (xsk_open) { >> + mlx5e_handle_rx_dim(xskrq); >> + mlx5e_cq_arm(&xsksq->cq); >> + mlx5e_cq_arm(&xskrq->cq); >> + } >> + >> if (unlikely(aff_change && busy_xsk)) { >> mlx5e_trigger_irq(&c->icosq); >> ch_stats->force_irq++; ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ 2026-01-26 16:06 ` Alice Mikityanska @ 2026-01-26 20:54 ` Tariq Toukan 0 siblings, 0 replies; 12+ messages in thread From: Tariq Toukan @ 2026-01-26 20:54 UTC (permalink / raw) To: Alice Mikityanska, William Tu, Tariq Toukan Cc: David Wei, Jakub Kicinski, Gal Pressman, Daniel Borkmann, netdev On 26/01/2026 18:06, Alice Mikityanska wrote: > On Sun, Jan 25, 2026, at 10:33, Tariq Toukan wrote: >> On 24/01/2026 0:39, Daniel Borkmann wrote: >>> This reverts the following commits: >>> >>> - ea945f4f3991 ("net/mlx5e: Move async ICOSQ lock into ICOSQ struct") >>> - 56aca3e0f730 ("net/mlx5e: Use regular ICOSQ for triggering NAPI") > > Hi Tariq and William, > Hi Alice, Thanks for your valuable comments! > A few comments on my side: > > I think the second patch is wrong in how it repurposes the regular ICOSQ for XSK wakeups. mlx5e_xsk_wakeup() can be called on any CPU that doesn't necessarily match the channel's CPU. That's why the spinlock around async_icosq was needed. By replacing icosq with async_icosq in mlx5e_trigger_napi_icosq(), this patch also affects what's done in mlx5e_xsk_wakeup(). Accurate. We had no intention of moving XSK to use the "regular" ICOSQ. It was done by mistake. As you can see, all checks in XSK are done against the async ICOSQ, and the conditional creation of async ICOSQ checks for xsk_pool. I have WIP patches fixing this, moving the XSK nop posting back to the async ICOSQ. > The consequences are: > > 1. The checks in mlx5e_xsk_wakeup() are made on async_icosq, but the regular icosq is then used. > > 2. As far as I understood the commit description, it's intended that mlx5e_trigger_napi_icosq() should use icosq unlocked most of the time, but the critical section is actually needed when it's called from mlx5e_xsk_wakeup(), because it can run on any CPU. > > Looking at the code, though, I feel there is also some typo that defeats the purpose of this patch. 
Every mlx5e_trigger_napi_icosq() call will test_and_set_bit(), and the set bit means do the lock. As the bit is always being set right before calling mlx5e_icosq_sync_lock(), it will lock on icosq at all times The MLX5E_SQ_STATE_LOCK_NEEDED bit is used to sync against the RX fast-path flow in napi that posts UMR WQEs to the icosq. This is an optimization, as the lock is not needed in the napi flow once the activation stage completes. I think you're right about mlx5e_trigger_napi_icosq: The "test" part seems redundant, as it's the only place where the bit is set. (In earlier internal versions that was not the case, and this is some kind of a leftover). I'd still prefer using the API (mlx5e_icosq_sync_lock) to obtain the lock though. > (well, unless a racy other thread calls clear_bit() in between of test_and_set_bit() and mlx5e_icosq_sync_lock()). Moreover, the way it's implemented now, synchronize_net() will also be called on each post of WQEs, causing a severe slowdown in datapath. > No, why? mlx5e_trigger_napi_icosq is a slow-path function (after the XSK issue gets fixed). > One more thing that needs to be taken into account if you find a way to post XSK wakeups to the regular ICOSQ: you should reserve +1 WQE (for XSK NOP) when allocating the ICOSQ, and also use the MLX5E_SQ_STATE_PENDING_XSK_TX flag correctly (to avoid unnecessarily posting many NOPs and risking overflowing the queue). > Right, but won't be relevant after the fix. >>> - 1b080bd74840 ("net/mlx5e: Move async ICOSQ to dynamic allocation") >>> - abed42f9cd80 ("net/mlx5e: Conditionally create async ICOSQ") >>> >>> There are a couple of regressions on the xsk side I ran into: >>> >>> Commit 56aca3e0f730 triggers an illegal synchronize_rcu() in an RCU read- >>> side critical section via mlx5e_xsk_wakeup() -> mlx5e_trigger_napi_icosq() >>> -> synchronize_net(). The stack holds RCU read-lock in xsk_poll(). 
>>> >>> Additionally, this also hits a NULL pointer dereference in mlx5e_xsk_wakeup(): >>> >>> [ 103.963735] BUG: kernel NULL pointer dereference, address: 0000000000000240 >>> [ 103.963743] #PF: supervisor read access in kernel mode >>> [ 103.963746] #PF: error_code(0x0000) - not-present page >>> [ 103.963749] PGD 0 P4D 0 >>> [ 103.963752] Oops: Oops: 0000 [#1] SMP >>> [ 103.963756] CPU: 0 UID: 0 PID: 2255 Comm: qemu-system-x86 Not tainted 6.19.0-rc5+ #229 PREEMPT(none) >>> [ 103.963761] Hardware name: [...] >>> [ 103.963765] RIP: 0010:mlx5e_xsk_wakeup+0x53/0x90 [mlx5_core] >>> >>> What happens is that c->async_icosq is NULL when in mlx5e_xsk_wakeup() >>> and therefore access to c->async_icosq->state triggers it. (On the NIC >>> there is an XDP program installed by the control plane where traffic >>> gets redirected into an xsk map - there was no xsk pool set up yet. >>> At some later time a xsk pool is set up and the related xsk socket is >>> added to the xsk map of the XDP program.) >>> >> >> Hi Daniel, >> >> Thanks for your report. >> >>> Reverting the series fixes the problems again. >>> >> >> Revert is too aggressive here. A fix is preferable. >> We're investigating the issue in order to fix it. >> We'll update. 
>> >>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> >>> Cc: William Tu <witu@nvidia.com> >>> Cc: Tariq Toukan <tariqt@nvidia.com> >>> Cc: David Wei <dw@davidwei.uk> >>> Cc: Jakub Kicinski <kuba@kernel.org> >>> --- >>> drivers/net/ethernet/mellanox/mlx5/core/en.h | 26 +---- >>> .../mellanox/mlx5/core/en/reporter_tx.c | 1 - >>> .../ethernet/mellanox/mlx5/core/en/xsk/rx.c | 3 - >>> .../ethernet/mellanox/mlx5/core/en/xsk/tx.c | 6 +- >>> .../mellanox/mlx5/core/en_accel/ktls.c | 10 +- >>> .../mellanox/mlx5/core/en_accel/ktls_rx.c | 26 ++--- >>> .../mellanox/mlx5/core/en_accel/ktls_txrx.h | 3 +- >>> .../net/ethernet/mellanox/mlx5/core/en_main.c | 100 +++++------------- >>> .../net/ethernet/mellanox/mlx5/core/en_rx.c | 4 - >>> .../net/ethernet/mellanox/mlx5/core/en_txrx.c | 37 +++---- >>> 10 files changed, 62 insertions(+), 154 deletions(-) >>> >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h >>> index 19b9683f4622..ff4ab4691baf 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h >>> @@ -388,7 +388,6 @@ enum { >>> MLX5E_SQ_STATE_DIM, >>> MLX5E_SQ_STATE_PENDING_XSK_TX, >>> MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, >>> - MLX5E_SQ_STATE_LOCK_NEEDED, >>> MLX5E_NUM_SQ_STATES, /* Must be kept last */ >>> }; >>> >>> @@ -546,11 +545,6 @@ struct mlx5e_icosq { >>> u32 sqn; >>> u16 reserved_room; >>> unsigned long state; >>> - /* icosq can be accessed from any CPU and from different contexts >>> - * (NAPI softirq or process/workqueue). Always use spin_lock_bh for >>> - * simplicity and correctness across all contexts. 
>>> - */ >>> - spinlock_t lock; >>> struct mlx5e_ktls_resync_resp *ktls_resync; >>> >>> /* control path */ >>> @@ -782,7 +776,9 @@ struct mlx5e_channel { >>> struct mlx5e_xdpsq xsksq; >>> >>> /* Async ICOSQ */ >>> - struct mlx5e_icosq *async_icosq; >>> + struct mlx5e_icosq async_icosq; >>> + /* async_icosq can be accessed from any CPU - the spinlock protects it. */ >>> + spinlock_t async_icosq_lock; >>> >>> /* data path - accessed per napi poll */ >>> const struct cpumask *aff_mask; >>> @@ -805,21 +801,6 @@ struct mlx5e_channel { >>> struct dim_cq_moder tx_cq_moder; >>> }; >>> >>> -static inline bool mlx5e_icosq_sync_lock(struct mlx5e_icosq *sq) >>> -{ >>> - if (likely(!test_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &sq->state))) >>> - return false; >>> - >>> - spin_lock_bh(&sq->lock); >>> - return true; >>> -} >>> - >>> -static inline void mlx5e_icosq_sync_unlock(struct mlx5e_icosq *sq, bool locked) >>> -{ >>> - if (unlikely(locked)) >>> - spin_unlock_bh(&sq->lock); >>> -} >>> - >>> struct mlx5e_ptp; >>> >>> struct mlx5e_channels { >>> @@ -939,7 +920,6 @@ struct mlx5e_priv { >>> u8 max_opened_tc; >>> bool tx_ptp_opened; >>> bool rx_ptp_opened; >>> - bool ktls_rx_was_enabled; >>> struct kernel_hwtstamp_config hwtstamp_config; >>> u16 q_counter[MLX5_SD_MAX_GROUP_SZ]; >>> u16 drop_rq_q_counter; >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>> index 4adc1adf9897..9e2cf191ed30 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>> @@ -15,7 +15,6 @@ static const char * const sq_sw_state_type_name[] = { >>> [MLX5E_SQ_STATE_DIM] = "dim", >>> [MLX5E_SQ_STATE_PENDING_XSK_TX] = "pending_xsk_tx", >>> [MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC] = "pending_tls_rx_resync", >>> - [MLX5E_SQ_STATE_LOCK_NEEDED] = "lock_needed", >>> }; >>> >>> static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) >>> diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >>> index 4f984f6a2cb9..2b05536d564a 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c >>> @@ -23,7 +23,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> struct mlx5_wq_cyc *wq = &icosq->wq; >>> struct mlx5e_umr_wqe *umr_wqe; >>> struct xdp_buff **xsk_buffs; >>> - bool sync_locked; >>> int batch, i; >>> u32 offset; /* 17-bit value with MTT. */ >>> u16 pi; >>> @@ -48,7 +47,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> goto err_reuse_batch; >>> } >>> >>> - sync_locked = mlx5e_icosq_sync_lock(icosq); >>> pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs); >>> umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); >>> memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); >>> @@ -145,7 +143,6 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> }; >>> >>> icosq->pc += rq->mpwqe.umr_wqebbs; >>> - mlx5e_icosq_sync_unlock(icosq, sync_locked); >>> >>> icosq->doorbell_cseg = &umr_wqe->hdr.ctrl; >>> >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >>> index 9e33156fac8a..a59199ed590d 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c >>> @@ -26,12 +26,10 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) >>> * active and not polled by NAPI. Return 0, because the upcoming >>> * activate will trigger the IRQ for us. 
>>> */ >>> - if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, >>> - &c->async_icosq->state))) >>> + if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &c->async_icosq.state))) >>> return 0; >>> >>> - if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, >>> - &c->async_icosq->state)) >>> + if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) >>> return 0; >>> >>> mlx5e_trigger_napi_icosq(c); >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >>> index 1c2cc2aad2b0..e3e57c849436 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c >>> @@ -135,15 +135,10 @@ int mlx5e_ktls_set_feature_rx(struct net_device *netdev, bool enable) >>> int err = 0; >>> >>> mutex_lock(&priv->state_lock); >>> - if (enable) { >>> + if (enable) >>> err = mlx5e_accel_fs_tcp_create(priv->fs); >>> - if (!err && !priv->ktls_rx_was_enabled) { >>> - priv->ktls_rx_was_enabled = true; >>> - mlx5e_safe_reopen_channels(priv); >>> - } >>> - } else { >>> + else >>> mlx5e_accel_fs_tcp_destroy(priv->fs); >>> - } >>> mutex_unlock(&priv->state_lock); >>> >>> return err; >>> @@ -166,7 +161,6 @@ int mlx5e_ktls_init_rx(struct mlx5e_priv *priv) >>> destroy_workqueue(priv->tls->rx_wq); >>> return err; >>> } >>> - priv->ktls_rx_was_enabled = true; >>> } >>> >>> return 0; >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >>> index 5d8fe252799e..da2d1eb52c13 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c >>> @@ -202,8 +202,8 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, >>> int err; >>> >>> err = 0; >>> - sq = c->async_icosq; >>> - spin_lock_bh(&sq->lock); >>> + sq = &c->async_icosq; >>> + spin_lock_bh(&c->async_icosq_lock); >>> >>> cseg = 
post_static_params(sq, priv_rx); >>> if (IS_ERR(cseg)) >>> @@ -214,7 +214,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, >>> >>> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); >>> unlock: >>> - spin_unlock_bh(&sq->lock); >>> + spin_unlock_bh(&c->async_icosq_lock); >>> >>> return err; >>> >>> @@ -277,10 +277,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, >>> >>> buf->priv_rx = priv_rx; >>> >>> - spin_lock_bh(&sq->lock); >>> + spin_lock_bh(&sq->channel->async_icosq_lock); >>> >>> if (unlikely(!mlx5e_icosq_can_post_wqe(sq, MLX5E_KTLS_GET_PROGRESS_WQEBBS))) { >>> - spin_unlock_bh(&sq->lock); >>> + spin_unlock_bh(&sq->channel->async_icosq_lock); >>> err = -ENOSPC; >>> goto err_dma_unmap; >>> } >>> @@ -311,7 +311,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, >>> icosq_fill_wi(sq, pi, &wi); >>> sq->pc++; >>> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); >>> - spin_unlock_bh(&sq->lock); >>> + spin_unlock_bh(&sq->channel->async_icosq_lock); >>> >>> return 0; >>> >>> @@ -344,7 +344,7 @@ static void resync_handle_work(struct work_struct *work) >>> } >>> >>> c = resync->priv->channels.c[priv_rx->rxq]; >>> - sq = c->async_icosq; >>> + sq = &c->async_icosq; >>> >>> if (resync_post_get_progress_params(sq, priv_rx)) { >>> priv_rx->rq_stats->tls_resync_req_skip++; >>> @@ -371,7 +371,7 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r >>> struct mlx5e_icosq *sq; >>> bool trigger_poll; >>> >>> - sq = c->async_icosq; >>> + sq = &c->async_icosq; >>> ktls_resync = sq->ktls_resync; >>> trigger_poll = false; >>> >>> @@ -413,9 +413,9 @@ static void resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_r >>> return; >>> >>> if (!napi_if_scheduled_mark_missed(&c->napi)) { >>> - spin_lock_bh(&sq->lock); >>> + spin_lock_bh(&c->async_icosq_lock); >>> mlx5e_trigger_irq(sq); >>> - spin_unlock_bh(&sq->lock); >>> + spin_unlock_bh(&c->async_icosq_lock); >>> } >>> } >>> >>> @@ -753,7 +753,7 @@ bool 
mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >>> LIST_HEAD(local_list); >>> int i, j; >>> >>> - sq = c->async_icosq; >>> + sq = &c->async_icosq; >>> >>> if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) >>> return false; >>> @@ -772,7 +772,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >>> clear_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &sq->state); >>> spin_unlock(&ktls_resync->lock); >>> >>> - spin_lock(&sq->lock); >>> + spin_lock(&c->async_icosq_lock); >>> for (j = 0; j < i; j++) { >>> struct mlx5_wqe_ctrl_seg *cseg; >>> >>> @@ -791,7 +791,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget) >>> } >>> if (db_cseg) >>> mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, db_cseg); >>> - spin_unlock(&sq->lock); >>> + spin_unlock(&c->async_icosq_lock); >>> >>> priv_rx->rq_stats->tls_resync_res_ok += j; >>> >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >>> index 4022c7e78a2e..cb08799769ee 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.h >>> @@ -50,8 +50,7 @@ bool mlx5e_ktls_rx_handle_resync_list(struct mlx5e_channel *c, int budget); >>> static inline bool >>> mlx5e_ktls_rx_pending_resync_list(struct mlx5e_channel *c, int budget) >>> { >>> - return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, >>> - &c->async_icosq->state); >>> + return budget && test_bit(MLX5E_SQ_STATE_PENDING_TLS_RX_RESYNC, &c->async_icosq.state); >>> } >>> >>> static inline void >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >>> index e1e05c9e7ebb..446510153e5e 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c >>> @@ -2075,8 +2075,6 @@ static int 
mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params >>> if (err) >>> goto err_free_icosq; >>> >>> - spin_lock_init(&sq->lock); >>> - >>> if (param->is_tls) { >>> sq->ktls_resync = mlx5e_ktls_rx_resync_create_resp_list(); >>> if (IS_ERR(sq->ktls_resync)) { >>> @@ -2589,51 +2587,9 @@ static int mlx5e_open_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param >>> return mlx5e_open_rq(params, rq_params, NULL, cpu_to_node(c->cpu), q_counter, &c->rq); >>> } >>> >>> -static struct mlx5e_icosq * >>> -mlx5e_open_async_icosq(struct mlx5e_channel *c, >>> - struct mlx5e_params *params, >>> - struct mlx5e_channel_param *cparam, >>> - struct mlx5e_create_cq_param *ccp) >>> -{ >>> - struct dim_cq_moder icocq_moder = {0, 0}; >>> - struct mlx5e_icosq *async_icosq; >>> - int err; >>> - >>> - async_icosq = kvzalloc_node(sizeof(*async_icosq), GFP_KERNEL, >>> - cpu_to_node(c->cpu)); >>> - if (!async_icosq) >>> - return ERR_PTR(-ENOMEM); >>> - >>> - err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, ccp, >>> - &async_icosq->cq); >>> - if (err) >>> - goto err_free_async_icosq; >>> - >>> - err = mlx5e_open_icosq(c, params, &cparam->async_icosq, async_icosq, >>> - mlx5e_async_icosq_err_cqe_work); >>> - if (err) >>> - goto err_close_async_icosq_cq; >>> - >>> - return async_icosq; >>> - >>> -err_close_async_icosq_cq: >>> - mlx5e_close_cq(&async_icosq->cq); >>> -err_free_async_icosq: >>> - kvfree(async_icosq); >>> - return ERR_PTR(err); >>> -} >>> - >>> -static void mlx5e_close_async_icosq(struct mlx5e_icosq *async_icosq) >>> -{ >>> - mlx5e_close_icosq(async_icosq); >>> - mlx5e_close_cq(&async_icosq->cq); >>> - kvfree(async_icosq); >>> -} >>> - >>> static int mlx5e_open_queues(struct mlx5e_channel *c, >>> struct mlx5e_params *params, >>> - struct mlx5e_channel_param *cparam, >>> - bool async_icosq_needed) >>> + struct mlx5e_channel_param *cparam) >>> { >>> const struct net_device_ops *netdev_ops = c->netdev->netdev_ops; >>> struct dim_cq_moder 
icocq_moder = {0, 0}; >>> @@ -2642,10 +2598,15 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >>> >>> mlx5e_build_create_cq_param(&ccp, c); >>> >>> + err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->async_icosq.cqp, &ccp, >>> + &c->async_icosq.cq); >>> + if (err) >>> + return err; >>> + >>> err = mlx5e_open_cq(c->mdev, icocq_moder, &cparam->icosq.cqp, &ccp, >>> &c->icosq.cq); >>> if (err) >>> - return err; >>> + goto err_close_async_icosq_cq; >>> >>> err = mlx5e_open_tx_cqs(c, params, &ccp, cparam); >>> if (err) >>> @@ -2669,14 +2630,12 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >>> if (err) >>> goto err_close_rx_cq; >>> >>> - if (async_icosq_needed) { >>> - c->async_icosq = mlx5e_open_async_icosq(c, params, cparam, >>> - &ccp); >>> - if (IS_ERR(c->async_icosq)) { >>> - err = PTR_ERR(c->async_icosq); >>> - goto err_close_rq_xdpsq_cq; >>> - } >>> - } >>> + spin_lock_init(&c->async_icosq_lock); >>> + >>> + err = mlx5e_open_icosq(c, params, &cparam->async_icosq, &c->async_icosq, >>> + mlx5e_async_icosq_err_cqe_work); >>> + if (err) >>> + goto err_close_rq_xdpsq_cq; >>> >>> mutex_init(&c->icosq_recovery_lock); >>> >>> @@ -2712,8 +2671,7 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >>> mlx5e_close_icosq(&c->icosq); >>> >>> err_close_async_icosq: >>> - if (c->async_icosq) >>> - mlx5e_close_async_icosq(c->async_icosq); >>> + mlx5e_close_icosq(&c->async_icosq); >>> >>> err_close_rq_xdpsq_cq: >>> if (c->xdp) >>> @@ -2732,6 +2690,9 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, >>> err_close_icosq_cq: >>> mlx5e_close_cq(&c->icosq.cq); >>> >>> +err_close_async_icosq_cq: >>> + mlx5e_close_cq(&c->async_icosq.cq); >>> + >>> return err; >>> } >>> >>> @@ -2745,8 +2706,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) >>> mlx5e_close_sqs(c); >>> mlx5e_close_icosq(&c->icosq); >>> mutex_destroy(&c->icosq_recovery_lock); >>> - if (c->async_icosq) >>> - mlx5e_close_async_icosq(c->async_icosq); >>> + 
mlx5e_close_icosq(&c->async_icosq); >>> if (c->xdp) >>> mlx5e_close_cq(&c->rq_xdpsq.cq); >>> mlx5e_close_cq(&c->rq.cq); >>> @@ -2754,6 +2714,7 @@ static void mlx5e_close_queues(struct mlx5e_channel *c) >>> mlx5e_close_xdpredirect_sq(c->xdpsq); >>> mlx5e_close_tx_cqs(c); >>> mlx5e_close_cq(&c->icosq.cq); >>> + mlx5e_close_cq(&c->async_icosq.cq); >>> } >>> >>> static u8 mlx5e_enumerate_lag_port(struct mlx5_core_dev *mdev, int ix) >>> @@ -2789,16 +2750,9 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu) >>> >>> void mlx5e_trigger_napi_icosq(struct mlx5e_channel *c) >>> { >>> - bool locked; >>> - >>> - if (!test_and_set_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state)) >>> - synchronize_net(); >>> - >>> - locked = mlx5e_icosq_sync_lock(&c->icosq); >>> - mlx5e_trigger_irq(&c->icosq); >>> - mlx5e_icosq_sync_unlock(&c->icosq, locked); >>> - >>> - clear_bit(MLX5E_SQ_STATE_LOCK_NEEDED, &c->icosq.state); >>> + spin_lock_bh(&c->async_icosq_lock); >>> + mlx5e_trigger_irq(&c->async_icosq); >>> + spin_unlock_bh(&c->async_icosq_lock); >>> } >>> >>> void mlx5e_trigger_napi_sched(struct napi_struct *napi) >>> @@ -2831,7 +2785,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, >>> struct mlx5e_channel_param *cparam; >>> struct mlx5_core_dev *mdev; >>> struct mlx5e_xsk_param xsk; >>> - bool async_icosq_needed; >>> struct mlx5e_channel *c; >>> unsigned int irq; >>> int vec_ix; >>> @@ -2881,8 +2834,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, >>> netif_napi_add_config_locked(netdev, &c->napi, mlx5e_napi_poll, ix); >>> netif_napi_set_irq_locked(&c->napi, irq); >>> >>> - async_icosq_needed = !!xsk_pool || priv->ktls_rx_was_enabled; >>> - err = mlx5e_open_queues(c, params, cparam, async_icosq_needed); >>> + err = mlx5e_open_queues(c, params, cparam); >>> if (unlikely(err)) >>> goto err_napi_del; >>> >>> @@ -2920,8 +2872,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) >>> for (tc = 0; tc < c->num_tc; 
tc++) >>> mlx5e_activate_txqsq(&c->sq[tc]); >>> mlx5e_activate_icosq(&c->icosq); >>> - if (c->async_icosq) >>> - mlx5e_activate_icosq(c->async_icosq); >>> + mlx5e_activate_icosq(&c->async_icosq); >>> >>> if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) >>> mlx5e_activate_xsk(c); >>> @@ -2942,8 +2893,7 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c) >>> else >>> mlx5e_deactivate_rq(&c->rq); >>> >>> - if (c->async_icosq) >>> - mlx5e_deactivate_icosq(c->async_icosq); >>> + mlx5e_deactivate_icosq(&c->async_icosq); >>> mlx5e_deactivate_icosq(&c->icosq); >>> for (tc = 0; tc < c->num_tc; tc++) >>> mlx5e_deactivate_txqsq(&c->sq[tc]); >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >>> index 1fc3720d2201..1f6930c77437 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c >>> @@ -778,7 +778,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> struct mlx5_wq_cyc *wq = &sq->wq; >>> struct mlx5e_umr_wqe *umr_wqe; >>> u32 offset; /* 17-bit value with MTT. 
*/ >>> - bool sync_locked; >>> u16 pi; >>> int err; >>> int i; >>> @@ -789,7 +788,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> goto err; >>> } >>> >>> - sync_locked = mlx5e_icosq_sync_lock(sq); >>> pi = mlx5e_icosq_get_next_pi(sq, rq->mpwqe.umr_wqebbs); >>> umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); >>> memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe)); >>> @@ -837,14 +835,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) >>> }; >>> >>> sq->pc += rq->mpwqe.umr_wqebbs; >>> - mlx5e_icosq_sync_unlock(sq, sync_locked); >>> >>> sq->doorbell_cseg = &umr_wqe->hdr.ctrl; >>> >>> return 0; >>> >>> err_unmap: >>> - mlx5e_icosq_sync_unlock(sq, sync_locked); >>> while (--i >= 0) { >>> frag_page--; >>> mlx5e_page_release_fragmented(rq->page_pool, frag_page); >>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >>> index b31f689fe271..76108299ea57 100644 >>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c >>> @@ -125,7 +125,6 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >>> { >>> struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel, >>> napi); >>> - struct mlx5e_icosq *aicosq = c->async_icosq; >>> struct mlx5e_ch_stats *ch_stats = c->stats; >>> struct mlx5e_xdpsq *xsksq = &c->xsksq; >>> struct mlx5e_txqsq __rcu **qos_sqs; >>> @@ -181,18 +180,15 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >>> busy |= work_done == budget; >>> >>> mlx5e_poll_ico_cq(&c->icosq.cq); >>> - if (aicosq) { >>> - if (mlx5e_poll_ico_cq(&aicosq->cq)) >>> - /* Don't clear the flag if nothing was polled to prevent >>> - * queueing more WQEs and overflowing the async ICOSQ. 
>>> - */ >>> - clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, >>> - &aicosq->state); >>> - >>> - /* Keep after async ICOSQ CQ poll */ >>> - if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) >>> - busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); >>> - } >>> + if (mlx5e_poll_ico_cq(&c->async_icosq.cq)) >>> + /* Don't clear the flag if nothing was polled to prevent >>> + * queueing more WQEs and overflowing the async ICOSQ. >>> + */ >>> + clear_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state); >>> + >>> + /* Keep after async ICOSQ CQ poll */ >>> + if (unlikely(mlx5e_ktls_rx_pending_resync_list(c, budget))) >>> + busy |= mlx5e_ktls_rx_handle_resync_list(c, budget); >>> >>> busy |= INDIRECT_CALL_2(rq->post_wqes, >>> mlx5e_post_rx_mpwqes, >>> @@ -240,17 +236,16 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget) >>> >>> mlx5e_cq_arm(&rq->cq); >>> mlx5e_cq_arm(&c->icosq.cq); >>> - if (aicosq) { >>> - mlx5e_cq_arm(&aicosq->cq); >>> - if (xsk_open) { >>> - mlx5e_handle_rx_dim(xskrq); >>> - mlx5e_cq_arm(&xsksq->cq); >>> - mlx5e_cq_arm(&xskrq->cq); >>> - } >>> - } >>> + mlx5e_cq_arm(&c->async_icosq.cq); >>> if (c->xdpsq) >>> mlx5e_cq_arm(&c->xdpsq->cq); >>> >>> + if (xsk_open) { >>> + mlx5e_handle_rx_dim(xskrq); >>> + mlx5e_cq_arm(&xsksq->cq); >>> + mlx5e_cq_arm(&xskrq->cq); >>> + } >>> + >>> if (unlikely(aff_change && busy_xsk)) { >>> mlx5e_trigger_irq(&c->icosq); >>> ch_stats->force_irq++; ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-02-04 6:20 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-01-23 22:39 [PATCH net-next] net/mlx5e: Undo saving per-channel async ICOSQ Daniel Borkmann 2026-01-25 8:33 ` Tariq Toukan 2026-01-26 9:23 ` Daniel Borkmann 2026-02-01 12:50 ` Tariq Toukan 2026-02-02 16:13 ` Daniel Borkmann 2026-02-03 9:20 ` Alice Mikityanska 2026-02-03 14:58 ` Tariq Toukan 2026-02-03 17:54 ` Alice Mikityanska 2026-02-04 6:20 ` Tariq Toukan 2026-02-03 12:30 ` Tariq Toukan 2026-01-26 16:06 ` Alice Mikityanska 2026-01-26 20:54 ` Tariq Toukan
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox