* [mellanox/mlx5-next RFC 1/1] net/mlx5: RX, Fix refcount warning on frag page release
2026-06-25 17:40 [mellanox/mlx5-next RFC 1/1] net/mlx5: RX, Fix refcount warning on frag page release Nabil S. Alramli
@ 2026-06-25 17:40 ` Nabil S. Alramli
0 siblings, 0 replies; 2+ messages in thread
From: Nabil S. Alramli @ 2026-06-25 17:40 UTC (permalink / raw)
To: saeedm, tariqt, mbloch, dtatulea
Cc: dev, nalramli, leon, andrew+netdev, davem, edumazet, kuba, pabeni,
netdev, linux-rdma, linux-kernel
Under memory pressure, the mlx5 driver triggers a WARNING during fragmented
page release. This happens because there is a discrepancy between what mlx5
thinks the page fragment counter is and what the page_pool actually says it
is.
The cause of the issue is page allocations on concurrent CPUs, which
increment the non-atomic u16 page counter mlx5e_frag_page.frags, while at
the same time the page reference counter net_iov.pp_ref_count is atomically
incremented. That sometimes leads to a difference in the counts and
therefore triggers the warning in page_pool_unref_netmem():
```
ret = atomic_long_sub_return(nr, pp_ref_count);
WARN_ON(ret < 0);
```
The actual stack trace looks like this:
```
WARNING: CPU: 37 PID: 447795 at include/net/page_pool/helpers.h:277 mlx5e_page_release_fragmented.isra.0+0x51/0x60 [mlx5_core]
Tainted: [S]=CPU_OUT_OF_SPEC, [O]=OOT_MODULE
Hardware name: *
RIP: 0010:mlx5e_page_release_fragmented.isra.0+0x51/0x60 [mlx5_core]
RSP: 0018:ffffc90019814d98 EFLAGS: 00010293
RAX: 000000000000003f RBX: ffff88c0993d0a10 RCX: ffffea02424592c0
RDX: 0000000000000001 RSI: ffffea02424592c0 RDI: ffff88c090e20000
RBP: 000000000000000a R08: 0000000000001409 R09: 0000000000000006
R10: 0000000000000000 R11: ffff88c095fbc040 R12: 000000000000141f
R13: 0000000000000009 R14: ffff88c090e20000 R15: 0000000000000001
FS: 00007f34149fa6c0(0000) GS:ffff89200fa40000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ed0265eb000 CR3: 0000005091cbe000 CR4: 0000000000350ef0
Call Trace:
<IRQ>
mlx5e_free_rx_wqes+0x7b/0xa0 [mlx5_core]
mlx5e_post_rx_wqes+0x1ac/0x5a0 [mlx5_core]
mlx5e_napi_poll+0x5e5/0x6f0 [mlx5_core]
__napi_poll+0x2b/0x1a0
net_rx_action+0x30e/0x370
? sched_clock+0x9/0x10
? sched_clock_cpu+0xf/0x170
handle_softirqs+0xe2/0x2a0
common_interrupt+0x85/0xa0
</IRQ>
<TASK>
asm_common_interrupt+0x26/0x40
RIP: 0010:page_counter_uncharge+0x34/0x90
RSP: 0018:ffffc900e728bb00 EFLAGS: 00000213
RAX: ffff88aff4762000 RBX: ffff88aff4762100 RCX: 0000000000000304
RDX: 0000000000000001 RSI: 00000000004e9e1a RDI: ffff88aff4762100
RBP: 0000000000000001 R08: ffff891ea0560048 R09: 00007ffffffff000
R10: 0000000000001000 R11: ffff891ae8061b00 R12: ffffffffffffffff
R13: ffff89107fcfd4c0 R14: ffff891ae8061b00 R15: ffff892002fe1400
uncharge_batch+0x40/0xd0
```
The fix is to use an atomic page fragment counter, so it will always match
the number of references held in the page_pool.
Signed-off-by: Nabil S. Alramli <dev@nalramli.com>
Fixes: 6f5742846053 ("net/mlx5e: RX, Enable skb page recycling through the page_pool")
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_rx.c | 39 ++++++++++---------
3 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 2270e2e550dd..c164106eb85d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -568,7 +568,7 @@ struct mlx5e_icosq {
struct mlx5e_frag_page {
netmem_ref netmem;
- u16 frags;
+ atomic_long_t frags;
};
enum mlx5e_wqe_frag_flag {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 5a46870c4b74..571a0df9f604 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -400,7 +400,7 @@ static int mlx5e_rq_alloc_mpwqe_linear_info(struct mlx5e_rq *rq, int node,
rq->mpwqe.linear_info = li;
/* Set to max to force allocation on first run. */
- li->frag_page.frags = li->max_frags;
+ atomic_long_set(&li->frag_page.frags, li->max_frags);
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5b60aa47c75b..ee360fa0c316 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -284,7 +284,7 @@ static int mlx5e_page_alloc_fragmented(struct page_pool *pp,
*frag_page = (struct mlx5e_frag_page) {
.netmem = netmem,
- .frags = 0,
+ .frags = ATOMIC_LONG_INIT(0),
};
return 0;
@@ -293,7 +293,7 @@ static int mlx5e_page_alloc_fragmented(struct page_pool *pp,
static void mlx5e_page_release_fragmented(struct page_pool *pp,
struct mlx5e_frag_page *frag_page)
{
- u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags;
+ u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - atomic_long_read(&frag_page->frags);
netmem_ref netmem = frag_page->netmem;
if (page_pool_unref_netmem(netmem, drain_count) == 0)
@@ -304,7 +304,7 @@ static int mlx5e_mpwqe_linear_page_refill(struct mlx5e_rq *rq)
{
struct mlx5e_mpw_linear_info *li = rq->mpwqe.linear_info;
- if (likely(li->frag_page.frags < li->max_frags))
+ if (likely(atomic_long_read(&li->frag_page.frags) < li->max_frags))
return 0;
if (likely(li->frag_page.netmem)) {
@@ -323,7 +323,8 @@ static void *mlx5e_mpwqe_get_linear_page_frag(struct mlx5e_rq *rq)
if (unlikely(mlx5e_mpwqe_linear_page_refill(rq)))
return NULL;
- frag_offset = li->frag_page.frags << MLX5E_XDP_LOG_MAX_LINEAR_SZ;
+ frag_offset = atomic_long_read(&li->frag_page.frags) <<
+ MLX5E_XDP_LOG_MAX_LINEAR_SZ;
WARN_ON(frag_offset >= BIT(rq->mpwqe.page_shift));
return netmem_address(li->frag_page.netmem) + frag_offset;
@@ -568,7 +569,7 @@ mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
return;
}
- frag_page->frags++;
+ atomic_long_inc(&frag_page->frags);
skb_add_rx_frag_netmem(skb, next_frag, netmem,
frag_offset, len, truesize);
}
@@ -744,7 +745,7 @@ void mlx5e_mpwqe_dealloc_linear_page(struct mlx5e_rq *rq)
* things in a good state for re-allocation.
*/
li->frag_page.netmem = 0;
- li->frag_page.frags = li->max_frags;
+ atomic_long_set(&li->frag_page.frags, li->max_frags);
}
INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
@@ -1615,7 +1616,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
/* queue up for recycling/reuse */
skb_mark_for_recycle(skb);
- frag_page->frags++;
+ atomic_long_inc(&frag_page->frags);
return skb;
}
@@ -1683,7 +1684,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
struct mlx5e_wqe_frag_info *pwi;
for (pwi = head_wi; pwi < wi; pwi++)
- pwi->frag_page->frags++;
+ atomic_long_inc(&pwi->frag_page->frags);
}
return NULL; /* page/packet was consumed by XDP */
}
@@ -1702,7 +1703,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
return NULL;
skb_mark_for_recycle(skb);
- head_wi->frag_page->frags++;
+ atomic_long_inc(&head_wi->frag_page->frags);
if (xdp_buff_has_frags(&mxbuf->xdp)) {
/* sinfo->nr_frags is reset by build_skb, calculate again. */
@@ -1711,7 +1712,7 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
xdp_buff_get_skb_flags(&mxbuf->xdp));
for (struct mlx5e_wqe_frag_info *pwi = head_wi + 1; pwi < wi; pwi++)
- pwi->frag_page->frags++;
+ atomic_long_inc(&pwi->frag_page->frags);
}
return skb;
@@ -1760,7 +1761,7 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
- wi->frag_page->frags++;
+ atomic_long_inc(&wi->frag_page->frags);
goto wq_cyc_pop;
}
@@ -1808,7 +1809,7 @@ static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
if (!skb) {
/* probably for XDP */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
- wi->frag_page->frags++;
+ atomic_long_inc(&wi->frag_page->frags);
goto wq_cyc_pop;
}
@@ -2011,9 +2012,9 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
struct mlx5e_frag_page *pfp;
for (pfp = head_page; pfp < frag_page; pfp++)
- pfp->frags++;
+ atomic_long_inc(&pfp->frags);
- linear_page->frags++;
+ atomic_long_inc(&linear_page->frags);
}
return NULL; /* page/packet was consumed by XDP */
}
@@ -2035,7 +2036,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
return NULL;
skb_mark_for_recycle(skb);
- linear_page->frags++;
+ atomic_long_inc(&linear_page->frags);
if (xdp_buff_has_frags(&mxbuf->xdp)) {
struct mlx5e_frag_page *pagep;
@@ -2048,7 +2049,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
pagep = head_page;
do
- pagep->frags++;
+ atomic_long_inc(&pagep->frags);
while (++pagep < frag_page);
headlen = min_t(u16, MLX5E_RX_MAX_HEAD - len,
@@ -2068,7 +2069,7 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
pagep = frag_page - sinfo->nr_frags;
do
- pagep->frags++;
+ atomic_long_inc(&pagep->frags);
while (++pagep < frag_page);
}
/* copy header */
@@ -2121,7 +2122,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
cqe_bcnt, mxbuf);
if (mlx5e_xdp_handle(rq, prog, mxbuf)) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
- frag_page->frags++;
+ atomic_long_inc(&frag_page->frags);
return NULL; /* page/packet was consumed by XDP */
}
@@ -2136,7 +2137,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
/* queue up for recycling/reuse */
skb_mark_for_recycle(skb);
- frag_page->frags++;
+ atomic_long_inc(&frag_page->frags);
return skb;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread