* [PATCH net-next 01/11] ipvs: Replace use of system_unbound_wq with system_dfl_long_wq
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
@ 2026-06-14 11:45 ` Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 02/11] netfilter: nf_tables: use DEBUG_NET_WARN_ON_ONCE in packet and control paths Pablo Neira Ayuso
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:45 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Marco Crivellari <marco.crivellari@suse.com>
This patch continues the effort to refactor workqueue APIs, which has
begun with the changes introducing new workqueues and a new
alloc_workqueue flag:
commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag")
The point of the refactoring is to eventually alter the default behavior
of workqueues to become unbound by default so that their workload
placement is optimized by the scheduler.
Before that can happen, workqueue users must be converted to the better
named new workqueues with no intended behaviour changes:
system_wq -> system_percpu_wq
system_unbound_wq -> system_dfl_wq
This way the old obsolete workqueues (system_wq, system_unbound_wq) can
be removed in the future.
This specific work is considered long, so enqueue it using
system_dfl_long_wq instead of system_dfl_wq.
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipvs/ip_vs_conn.c | 4 ++--
net/netfilter/ipvs/ip_vs_ctl.c | 10 +++++-----
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e76a73d183d5..cb36641f8d1c 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -285,7 +285,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/* Schedule resizing if load increases */
if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
!test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags))
- mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0);
+ mod_delayed_work(system_dfl_long_wq, &ipvs->conn_resize_work, 0);
return ret;
}
@@ -916,7 +916,7 @@ static void conn_resize_work_handler(struct work_struct *work)
out:
/* Monitor if we need to shrink table */
- queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work,
+ queue_delayed_work(system_dfl_long_wq, &ipvs->conn_resize_work,
more_work ? 1 : 2 * HZ);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index f765d1506839..bcf40b8c41cf 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,7 +821,7 @@ static void svc_resize_work_handler(struct work_struct *work)
if (!READ_ONCE(ipvs->enable) || !more_work ||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
return;
- queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
+ queue_delayed_work(system_dfl_long_wq, &ipvs->svc_resize_work, 1);
return;
unlock_m:
@@ -1869,7 +1869,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
/* Schedule resize work */
if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
- queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
+ queue_delayed_work(system_dfl_long_wq, &ipvs->svc_resize_work,
1);
*svc_p = svc;
@@ -2125,7 +2125,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
rcu_read_unlock();
if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
&ipvs->work_flags))
- queue_delayed_work(system_unbound_wq,
+ queue_delayed_work(system_dfl_long_wq,
&ipvs->svc_resize_work, 1);
}
return 0;
@@ -2606,7 +2606,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
} else {
WRITE_ONCE(*valp, val);
if (rcu_access_pointer(ipvs->conn_tab))
- mod_delayed_work(system_unbound_wq,
+ mod_delayed_work(system_dfl_long_wq,
&ipvs->conn_resize_work, 0);
}
}
@@ -2638,7 +2638,7 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
READ_ONCE(ipvs->enable) &&
!test_bit(IP_VS_WORK_SVC_NORESIZE,
&ipvs->work_flags))
- mod_delayed_work(system_unbound_wq,
+ mod_delayed_work(system_dfl_long_wq,
&ipvs->svc_resize_work, 0);
mutex_unlock(&ipvs->service_mutex);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 02/11] netfilter: nf_tables: use DEBUG_NET_WARN_ON_ONCE in packet and control paths
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 01/11] ipvs: Replace use of system_unbound_wq with system_dfl_long_wq Pablo Neira Ayuso
@ 2026-06-14 11:45 ` Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 03/11] netfilter: nf_conncount: callers must hold rcu read lock Pablo Neira Ayuso
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:45 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Fernando Fernandez Mancera <fmancera@suse.de>
Replace raw warning macros with DEBUG_NET_WARN_ON_ONCE across the
nf_tables API, core engine, and expression evaluations. This prevents
unnecessary system panics when panic_on_warn=1 is enabled in production
systems.
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 38 +++++++++++++++++++++++--------
net/netfilter/nf_tables_core.c | 8 ++++---
net/netfilter/nf_tables_offload.c | 2 +-
net/netfilter/nf_tables_trace.c | 6 +++--
net/netfilter/nft_ct.c | 2 +-
net/netfilter/nft_ct_fast.c | 2 +-
net/netfilter/nft_exthdr.c | 2 +-
net/netfilter/nft_fib.c | 2 +-
net/netfilter/nft_inner.c | 2 +-
net/netfilter/nft_lookup.c | 2 +-
net/netfilter/nft_masq.c | 2 +-
net/netfilter/nft_meta.c | 10 ++++----
net/netfilter/nft_payload.c | 6 ++---
net/netfilter/nft_redir.c | 2 +-
net/netfilter/nft_reject.c | 8 +++++--
net/netfilter/nft_rt.c | 2 +-
net/netfilter/nft_set_hash.c | 2 +-
net/netfilter/nft_set_pipapo.c | 2 +-
net/netfilter/nft_set_rbtree.c | 6 +++--
net/netfilter/nft_socket.c | 8 ++++---
net/netfilter/nft_tunnel.c | 2 +-
net/netfilter/nft_xfrm.c | 6 ++---
22 files changed, 76 insertions(+), 46 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 87387adbca65..4884f7f7aaee 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3378,8 +3378,10 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
*/
int nft_register_expr(struct nft_expr_type *type)
{
- if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
+ if (unlikely(type->maxattr > NFT_EXPR_MAXATTR)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
@@ -3691,8 +3693,10 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
int err;
- if (WARN_ON_ONCE(!src->ops->clone))
+ if (unlikely(!src->ops->clone)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -EINVAL;
+ }
dst->ops = src->ops;
err = src->ops->clone(dst, src, gfp);
@@ -8327,8 +8331,10 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
return 0;
type = nft_obj_type_get(net, objtype, family);
- if (WARN_ON_ONCE(IS_ERR(type)))
+ if (IS_ERR(type)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return PTR_ERR(type);
+ }
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
@@ -10306,19 +10312,25 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
prule = (struct nft_rule_dp *)data;
data += offsetof(struct nft_rule_dp, data);
- if (WARN_ON_ONCE(data > data_boundary))
+ if (unlikely(data > data_boundary)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
size = 0;
nft_rule_for_each_expr(expr, last, rule) {
- if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
+ if (unlikely(data + size + expr->ops->size > data_boundary)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
memcpy(data + size, expr, expr->ops->size);
size += expr->ops->size;
}
- if (WARN_ON_ONCE(size >= 1 << 12))
+ if (unlikely(size >= 1 << 12)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
prule->handle = rule->handle;
prule->dlen = size;
@@ -10329,8 +10341,10 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha
chain->blob_next->size += (unsigned long)(data - (void *)prule);
}
- if (WARN_ON_ONCE(data > data_boundary))
+ if (unlikely(data > data_boundary)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
prule = (struct nft_rule_dp *)data;
nft_last_rule(chain, prule);
@@ -11636,8 +11650,10 @@ int nft_parse_register_load(const struct nft_ctx *ctx,
next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
/* Can't happen: nft_validate_register_load() should have failed */
- if (WARN_ON_ONCE(next_register > NFT_REG32_NUM))
+ if (unlikely(next_register > NFT_REG32_NUM)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -EINVAL;
+ }
/* find first register that did not see an earlier store. */
invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg);
@@ -11884,8 +11900,10 @@ int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
struct nlattr *tb[NFTA_DATA_MAX + 1];
int err;
- if (WARN_ON_ONCE(!desc->size))
+ if (unlikely(!desc->size)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -EINVAL;
+ }
err = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
nft_data_policy, NULL);
@@ -11950,7 +11968,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
break;
default:
err = -EINVAL;
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
}
nla_nest_end(skb, nest);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 8ab186f86dd4..01a72f334dc6 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -314,8 +314,10 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
switch (regs.verdict.code) {
case NFT_JUMP:
- if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
- return NF_DROP;
+ if (unlikely(stackptr >= NFT_JUMP_STACK_SIZE)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, ELOOP);
+ }
jumpstack[stackptr].rule = nft_rule_next(rule);
stackptr++;
fallthrough;
@@ -326,7 +328,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
case NFT_RETURN:
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
}
if (stackptr > 0) {
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 9101b1703b52..8998a24651ff 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -361,7 +361,7 @@ static int nft_block_setup(struct nft_base_chain *basechain,
err = nft_flow_offload_unbind(bo, basechain);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
err = -EOPNOTSUPP;
}
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index a88abae5a9de..d85b6a2fb43c 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -227,8 +227,10 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rul
last = (const struct nft_rule_dp_last *)rule;
- if (WARN_ON_ONCE(!last->chain))
+ if (unlikely(!last->chain)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return &info->basechain->chain;
+ }
return last->chain;
}
@@ -354,7 +356,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
return;
nla_put_failure:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
kfree_skb(skb);
}
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 9fe179d688da..25934c6f01fb 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -1132,7 +1132,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
to_assign = priv->helper6;
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
return;
}
diff --git a/net/netfilter/nft_ct_fast.c b/net/netfilter/nft_ct_fast.c
index ecf7b3a404be..a44524c4fe63 100644
--- a/net/netfilter/nft_ct_fast.c
+++ b/net/netfilter/nft_ct_fast.c
@@ -53,7 +53,7 @@ void nft_ct_get_fast_eval(const struct nft_expr *expr,
return;
#endif
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
regs->verdict.code = NFT_BREAK;
break;
}
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index e6a07c0df207..8861b4d191d1 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -298,7 +298,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
old.v32, new.v32, false);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 327a5f33659c..1d0d815c8745 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -155,7 +155,7 @@ void nft_fib_store_result(void *reg, const struct nft_fib *priv,
strscpy_pad(reg, dev ? dev->name : "", IFNAMSIZ);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
*dreg = 0;
break;
}
diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index d14ca157910b..97fb4eea2d66 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -308,7 +308,7 @@ static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
nft_inner_save_tun_ctx(pkt, &tun_ctx);
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 9fafe5afc490..ba512e94b402 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -50,7 +50,7 @@ __nft_set_do_lookup(const struct net *net, const struct nft_set *set,
if (set->ops == &nft_set_rbtree_type.ops)
return nft_rbtree_lookup(net, set, key);
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
#endif
return set->ops->lookup(net, set, key);
}
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 2b01128737a3..841efd981e20 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -123,7 +123,7 @@ static void nft_masq_eval(const struct nft_expr *expr,
break;
#endif
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 5b25851381e5..9b5821c64442 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -116,12 +116,12 @@ nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt,
nft_reg_store8(dest, PACKET_MULTICAST);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
return false;
}
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
return false;
}
@@ -460,7 +460,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
nft_meta_get_eval_sdifname(dest, pkt);
break;
default:
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
return;
@@ -506,7 +506,7 @@ void nft_meta_set_eval(const struct nft_expr *expr,
break;
#endif
default:
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
}
}
EXPORT_SYMBOL_GPL(nft_meta_set_eval);
@@ -886,7 +886,7 @@ void nft_meta_inner_eval(const struct nft_expr *expr,
nft_reg_store8(dest, tun_ctx->l4proto);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
return;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 484a5490832e..ef2a80dfc68f 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -196,7 +196,7 @@ void nft_payload_eval(const struct nft_expr *expr,
goto err;
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
offset += priv->offset;
@@ -603,7 +603,7 @@ void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs,
offset = tun_ctx->inner_thoff;
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
offset += priv->offset;
@@ -866,7 +866,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
goto err;
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 58ae802db8f5..a98aa28180fb 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -126,7 +126,7 @@ static void nft_redir_eval(const struct nft_expr *expr,
break;
#endif
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
}
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 196a92c7ea09..e3972e904cf0 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -102,8 +102,10 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = {
int nft_reject_icmp_code(u8 code)
{
- if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
+ if (unlikely(code > NFT_REJECT_ICMPX_MAX)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return ICMP_NET_UNREACH;
+ }
return icmp_code_v4[code];
}
@@ -120,8 +122,10 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = {
int nft_reject_icmpv6_code(u8 code)
{
- if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX))
+ if (unlikely(code > NFT_REJECT_ICMPX_MAX)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return ICMPV6_NOROUTE;
+ }
return icmp_code_v6[code];
}
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index e23cd4759851..aeb0094eafd8 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -93,7 +93,7 @@ void nft_rt_get_eval(const struct nft_expr *expr,
break;
#endif
default:
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
goto err;
}
return;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index b0e571c8e3f3..eb4e382119d4 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -385,7 +385,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
break;
default:
iter->err = -EINVAL;
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 50d4a4f04309..706c78853f24 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2199,7 +2199,7 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
break;
default:
iter->err = -EINVAL;
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
}
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index b4f0b5fdf1f2..018bbb6df4ce 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -654,8 +654,10 @@ static int nft_array_may_resize(const struct nft_set *set, bool flush)
}
realloc_array:
- if (WARN_ON_ONCE(nelems > new_max_intervals))
+ if (unlikely(nelems > new_max_intervals)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -ENOMEM;
+ }
if (priv->array_next) {
if (max_intervals == new_max_intervals)
@@ -878,7 +880,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
break;
default:
iter->err = -EINVAL;
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
}
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index a146a45d7531..52d892e04261 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -71,8 +71,10 @@ static noinline int nft_socket_cgroup_subtree_level(void)
if (level > 255)
return -ERANGE;
- if (WARN_ON_ONCE(level < 0))
+ if (unlikely(level < 0)) {
+ DEBUG_NET_WARN_ON_ONCE(1);
return -EINVAL;
+ }
return level;
}
@@ -97,7 +99,7 @@ static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
break;
#endif
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
}
@@ -152,7 +154,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
break;
#endif
default:
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
regs->verdict.code = NFT_BREAK;
}
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 68f7cfbbee06..0a018d4706a9 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -60,7 +60,7 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
break;
default:
- WARN_ON(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
regs->verdict.code = NFT_BREAK;
}
}
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 65a75d88e5f0..8cec43064319 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -132,7 +132,7 @@ static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
switch (priv->key) {
case NFT_XFRM_KEY_UNSPEC:
case __NFT_XFRM_KEY_MAX:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
break;
case NFT_XFRM_KEY_DADDR_IP4:
*dest = (__force __u32)state->id.daddr.a4;
@@ -206,7 +206,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr,
nft_xfrm_get_eval_out(priv, regs, pkt);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
regs->verdict.code = NFT_BREAK;
break;
}
@@ -252,7 +252,7 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
(1 << NF_INET_POST_ROUTING);
break;
default:
- WARN_ON_ONCE(1);
+ DEBUG_NET_WARN_ON_ONCE(1);
return -EINVAL;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 03/11] netfilter: nf_conncount: callers must hold rcu read lock
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 01/11] ipvs: Replace use of system_unbound_wq with system_dfl_long_wq Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 02/11] netfilter: nf_tables: use DEBUG_NET_WARN_ON_ONCE in packet and control paths Pablo Neira Ayuso
@ 2026-06-14 11:45 ` Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 04/11] netfilter: nf_conncount: use per nf_conncount_data spinlocks Pablo Neira Ayuso
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:45 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
rcu_dereference_raw() should not have been used here; it concealed this bug.
It's used because struct rb_node lacks __rcu annotated pointers, so plain
rcu_dereference causes sparse warnings.
The major tradeoff is that rcu_dereference_raw() doesn't warn when the caller
isn't in an rcu read section.
Extend the rcu read lock scope accordingly. This change causes sparse
warnings, but those warnings are the lesser evil.
Fixes: 11efd5cb04a1 ("openvswitch: Support conntrack zone limit")
Closes: https://sashiko.dev/#/patchset/20260603230610.7900-1-fw%40strlen.de
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 6 +++---
net/openvswitch/conntrack.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index ab28b47395bd..81e4a4e20df5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -499,7 +499,7 @@ count_tree(struct net *net,
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference_raw(root->rb_node);
+ parent = rcu_dereference(root->rb_node);
while (parent) {
int diff;
@@ -507,9 +507,9 @@ count_tree(struct net *net,
diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
- parent = rcu_dereference_raw(parent->rb_left);
+ parent = rcu_dereference(parent->rb_left);
} else if (diff > 0) {
- parent = rcu_dereference_raw(parent->rb_right);
+ parent = rcu_dereference(parent->rb_right);
} else {
int ret;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 7c9256572284..c6fd9c424e8f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1797,10 +1797,10 @@ static int ovs_ct_limit_get_zone_limit(struct net *net,
} else {
rcu_read_lock();
limit = ct_limit_get(info, zone);
- rcu_read_unlock();
err = __ovs_ct_limit_get_zone_limit(
net, info->data, zone, limit, reply);
+ rcu_read_unlock();
if (err)
return err;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH net-next 04/11] netfilter: nf_conncount: use per nf_conncount_data spinlocks
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (2 preceding siblings ...)
2026-06-14 11:45 ` [PATCH net-next 03/11] netfilter: nf_conncount: callers must hold rcu read lock Pablo Neira Ayuso
@ 2026-06-14 11:45 ` Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 05/11] netfilter: nf_conncount: split count_tree_node rbtree walk into helper Pablo Neira Ayuso
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:45 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
This change replaces the rb_root with a new container structure.
Instead of an array of locks shared by all nf_conncount_data objects,
each tree gains its own dedicated lock.
Downside: nf_conncount_data increases in size. Before this change:
struct nf_conncount_data {
[..]
/* --- cacheline 33 boundary (2112 bytes) was 16 bytes ago --- */
unsigned int gc_tree; /* 2128 4 */
/* size: 2136, cachelines: 34, members: 7 */
/* padding: 4 */
After:
/* size: 4184, cachelines: 66, members: 7 */
/* padding: 4 */
On LOCKDEP enabled kernels, this is even worse:
/* size: 18560, cachelines: 290, members: 7 */
(due to lockdep map in each spinlock).
For this reason also switch to kvzalloc. The zeroing variant is needed
so that the ->pending_trees bitmap does not start out with random heap
memory content.
Followup patch will add and use a sequence counter.
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 63 +++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 29 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 81e4a4e20df5..faecc05d34d4 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -54,12 +54,15 @@ struct nf_conncount_rb {
struct rcu_head rcu_head;
};
-static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;
+struct nf_conncount_root {
+ struct rb_root root;
+ spinlock_t lock;
+};
struct nf_conncount_data {
unsigned int keylen;
u32 initval;
- struct rb_root root[CONNCOUNT_SLOTS];
+ struct nf_conncount_root root[CONNCOUNT_SLOTS];
struct net *net;
struct work_struct gc_work;
unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
@@ -367,18 +370,19 @@ static void __tree_nodes_free(struct rcu_head *h)
kmem_cache_free(conncount_rb_cachep, rbconn);
}
-/* caller must hold tree nf_conncount_locks[] lock */
-static void tree_nodes_free(struct rb_root *root,
+static void tree_nodes_free(struct nf_conncount_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
{
struct nf_conncount_rb *rbconn;
+ lockdep_assert_held(&root->lock);
+
while (gc_count) {
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
if (!rbconn->list.count) {
- rb_erase(&rbconn->node, root);
+ rb_erase(&rbconn->node, &root->root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
}
spin_unlock(&rbconn->list.list_lock);
@@ -396,10 +400,10 @@ insert_tree(struct net *net,
const struct sk_buff *skb,
u16 l3num,
struct nf_conncount_data *data,
- struct rb_root *root,
unsigned int hash,
const u32 *key)
{
+ struct nf_conncount_root *root = &data->root[hash];
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
bool do_gc = true, refcounted = false;
@@ -410,10 +414,10 @@ insert_tree(struct net *net,
struct nf_conncount_rb *rbconn;
struct nf_conn *ct = NULL;
- spin_lock_bh(&nf_conncount_locks[hash]);
+ spin_lock_bh(&root->lock);
restart:
parent = NULL;
- rbnode = &(root->rb_node);
+ rbnode = &root->root.rb_node;
while (*rbnode) {
int diff;
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
@@ -475,12 +479,12 @@ insert_tree(struct net *net,
rbconn->list.count = count;
rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_insert_color(&rbconn->node, &root->root);
}
out_unlock:
if (refcounted)
nf_ct_put(ct);
- spin_unlock_bh(&nf_conncount_locks[hash]);
+ spin_unlock_bh(&root->lock);
return count;
}
@@ -491,7 +495,7 @@ count_tree(struct net *net,
struct nf_conncount_data *data,
const u32 *key)
{
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
@@ -499,7 +503,7 @@ count_tree(struct net *net,
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference(root->rb_node);
+ parent = rcu_dereference(root->root.rb_node);
while (parent) {
int diff;
@@ -544,14 +548,14 @@ count_tree(struct net *net,
if (!skb)
return 0;
- return insert_tree(net, skb, l3num, data, root, hash, key);
+ return insert_tree(net, skb, l3num, data, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
{
struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *node;
unsigned int tree, next_tree, gc_count = 0;
@@ -560,7 +564,7 @@ static void tree_gc_worker(struct work_struct *work)
local_bh_disable();
rcu_read_lock();
- for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+ for (node = rb_first(&root->root); node ; node = rb_next(node)) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
if (nf_conncount_gc_list(data->net, &rbconn->list))
gc_count++;
@@ -570,12 +574,12 @@ static void tree_gc_worker(struct work_struct *work)
cond_resched();
- spin_lock_bh(&nf_conncount_locks[tree]);
+ spin_lock_bh(&root->lock);
if (gc_count < ARRAY_SIZE(gc_nodes))
goto next; /* do not bother */
gc_count = 0;
- node = rb_first(root);
+ node = rb_first(&root->root);
while (node != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
@@ -602,7 +606,7 @@ static void tree_gc_worker(struct work_struct *work)
schedule_work(work);
}
- spin_unlock_bh(&nf_conncount_locks[tree]);
+ spin_unlock_bh(&root->lock);
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
@@ -620,6 +624,12 @@ unsigned int nf_conncount_count_skb(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
+static void nf_conncount_root_init(struct nf_conncount_root *r)
+{
+ r->root = RB_ROOT;
+ spin_lock_init(&r->lock);
+}
+
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
@@ -630,12 +640,12 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen
keylen == 0)
return ERR_PTR(-EINVAL);
- data = kmalloc_obj(*data);
+ data = kvzalloc_obj(*data);
if (!data)
return ERR_PTR(-ENOMEM);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
- data->root[i] = RB_ROOT;
+ nf_conncount_root_init(&data->root[i]);
data->keylen = keylen / sizeof(u32);
data->net = net;
@@ -655,15 +665,15 @@ void nf_conncount_cache_free(struct nf_conncount_list *list)
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
-static void destroy_tree(struct rb_root *r)
+static void destroy_tree(struct nf_conncount_root *r)
{
struct nf_conncount_rb *rbconn;
struct rb_node *node;
- while ((node = rb_first(r)) != NULL) {
+ while ((node = rb_first(&r->root)) != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
- rb_erase(node, r);
+ rb_erase(node, &r->root);
nf_conncount_cache_free(&rbconn->list);
@@ -680,17 +690,12 @@ void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
- kfree(data);
+ kvfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);
static int __init nf_conncount_modinit(void)
{
- int i;
-
- for (i = 0; i < CONNCOUNT_SLOTS; ++i)
- spin_lock_init(&nf_conncount_locks[i]);
-
conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 05/11] netfilter: nf_conncount: split count_tree_node rbtree walk into helper
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (3 preceding siblings ...)
2026-06-14 11:45 ` [PATCH net-next 04/11] netfilter: nf_conncount: use per nf_conncount_data spinlocks Pablo Neira Ayuso
@ 2026-06-14 11:45 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 06/11] netfilter: nf_conncount: add sequence counter to detect tree modifications Pablo Neira Ayuso
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:45 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
Add find_tree_node() helper that fetches a matching rbtree node.
This is used by a follow-up patch to optionally search the tree again while
preventing concurrent updates via the tree lock.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 102 +++++++++++++++++++++--------------
1 file changed, 62 insertions(+), 40 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index faecc05d34d4..56ac64ecfb75 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -488,6 +488,34 @@ insert_tree(struct net *net,
return count;
}
+static struct nf_conncount_rb *
+find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
+ const u32 *key)
+{
+ struct rb_node *parent;
+
+ parent = rcu_dereference_check(root->root.rb_node,
+ lockdep_is_held(&root->lock));
+ while (parent) {
+ struct nf_conncount_rb *rbconn;
+ int diff;
+
+ rbconn = rb_entry(parent, struct nf_conncount_rb, node);
+
+ diff = key_diff(key, rbconn->key, data->keylen);
+ if (diff < 0)
+ parent = rcu_dereference_check(parent->rb_left,
+ lockdep_is_held(&root->lock));
+ else if (diff > 0)
+ parent = rcu_dereference_check(parent->rb_right,
+ lockdep_is_held(&root->lock));
+ else
+ return rbconn;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
static unsigned int
count_tree(struct net *net,
const struct sk_buff *skb,
@@ -496,59 +524,53 @@ count_tree(struct net *net,
const u32 *key)
{
struct nf_conncount_root *root;
- struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
+ int ret;
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference(root->root.rb_node);
- while (parent) {
- int diff;
-
- rbconn = rb_entry(parent, struct nf_conncount_rb, node);
+ rbconn = find_tree_node(root, data, key);
+ if (IS_ERR(rbconn)) {
+ if (PTR_ERR(rbconn) == -ENOENT) {
+ if (!skb)
+ return 0;
- diff = key_diff(key, rbconn->key, data->keylen);
- if (diff < 0) {
- parent = rcu_dereference(parent->rb_left);
- } else if (diff > 0) {
- parent = rcu_dereference(parent->rb_right);
- } else {
- int ret;
+ return insert_tree(net, skb, l3num, data, hash, key);
+ }
+ DEBUG_NET_WARN_ON_ONCE(IS_ERR(rbconn));
+ }
- if (!skb) {
- nf_conncount_gc_list(net, &rbconn->list);
- return rbconn->list.count;
- }
+ DEBUG_NET_WARN_ON_ONCE(IS_ERR_OR_NULL(rbconn));
+ if (IS_ERR_OR_NULL(rbconn))
+ return 0;
- spin_lock_bh(&rbconn->list.list_lock);
- /* Node might be about to be free'd.
- * We need to defer to insert_tree() in this case.
- */
- if (rbconn->list.count == 0) {
- spin_unlock_bh(&rbconn->list.list_lock);
- break;
- }
+ if (!skb) {
+ nf_conncount_gc_list(net, &rbconn->list);
+ return rbconn->list.count;
+ }
- /* same source network -> be counted! */
- ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
- spin_unlock_bh(&rbconn->list.list_lock);
- if (ret && ret != -EEXIST) {
- return 0; /* hotdrop */
- } else {
- /* -EEXIST means add was skipped, update the list */
- if (ret == -EEXIST)
- nf_conncount_gc_list(net, &rbconn->list);
- return rbconn->list.count;
- }
- }
+ spin_lock_bh(&rbconn->list.list_lock);
+ /* Node might be about to be free'd.
+ * We need to defer to insert_tree() in this case.
+ */
+ if (rbconn->list.count == 0) {
+ spin_unlock_bh(&rbconn->list.list_lock);
+ return insert_tree(net, skb, l3num, data, hash, key);
}
- if (!skb)
- return 0;
+ /* same source network -> be counted! */
+ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
+ spin_unlock_bh(&rbconn->list.list_lock);
+
+ if (ret && ret != -EEXIST)
+ return 0; /* hotdrop */
+ /* -EEXIST means add was skipped, update the list */
+ if (ret == -EEXIST)
+ nf_conncount_gc_list(net, &rbconn->list);
- return insert_tree(net, skb, l3num, data, hash, key);
+ return rbconn->list.count;
}
static void tree_gc_worker(struct work_struct *work)
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 06/11] netfilter: nf_conncount: add sequence counter to detect tree modifications
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (4 preceding siblings ...)
2026-06-14 11:45 ` [PATCH net-next 05/11] netfilter: nf_conncount: split count_tree_node rbtree walk into helper Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 07/11] netfilter: nf_conncount: gc and rcu fixes Pablo Neira Ayuso
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
There are two issues with traversal:
1. Key lookup (tree search) cannot detect concurrent modifications and may
not find a result in case of parallel modification.
2. Worker does a lockless iteration. This is never safe.
Add a sequence counter and re-do the lookup under lock in case the
tree was modified / seqcount changed.
gc_worker bugs are addressed in the next patch.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 56ac64ecfb75..1247cbe77740 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -57,6 +57,7 @@ struct nf_conncount_rb {
struct nf_conncount_root {
struct rb_root root;
spinlock_t lock;
+ seqcount_spinlock_t count;
};
struct nf_conncount_data {
@@ -382,8 +383,10 @@ static void tree_nodes_free(struct nf_conncount_root *root,
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
if (!rbconn->list.count) {
+ write_seqcount_begin(&root->count);
rb_erase(&rbconn->node, &root->root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+ write_seqcount_end(&root->count);
}
spin_unlock(&rbconn->list.list_lock);
}
@@ -478,8 +481,10 @@ insert_tree(struct net *net,
count = 1;
rbconn->list.count = count;
+ write_seqcount_begin(&root->count);
rb_link_node_rcu(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, &root->root);
+ write_seqcount_end(&root->count);
}
out_unlock:
if (refcounted)
@@ -492,6 +497,7 @@ static struct nf_conncount_rb *
find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
const u32 *key)
{
+ unsigned int seq = read_seqcount_begin(&root->count);
struct rb_node *parent;
parent = rcu_dereference_check(root->root.rb_node,
@@ -511,8 +517,14 @@ find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
lockdep_is_held(&root->lock));
else
return rbconn;
+
+ if (read_seqcount_retry(&root->count, seq))
+ return ERR_PTR(-EAGAIN);
}
+ if (read_seqcount_retry(&root->count, seq))
+ return ERR_PTR(-EAGAIN);
+
return ERR_PTR(-ENOENT);
}
@@ -533,6 +545,12 @@ count_tree(struct net *net,
rbconn = find_tree_node(root, data, key);
if (IS_ERR(rbconn)) {
+ if (PTR_ERR(rbconn) == -EAGAIN) {
+ spin_lock_bh(&root->lock);
+ rbconn = find_tree_node(root, data, key);
+ spin_unlock_bh(&root->lock);
+ }
+
if (PTR_ERR(rbconn) == -ENOENT) {
if (!skb)
return 0;
@@ -650,6 +668,7 @@ static void nf_conncount_root_init(struct nf_conncount_root *r)
{
r->root = RB_ROOT;
spin_lock_init(&r->lock);
+ seqcount_spinlock_init(&r->count, &r->lock);
}
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 07/11] netfilter: nf_conncount: gc and rcu fixes
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (5 preceding siblings ...)
2026-06-14 11:46 ` [PATCH net-next 06/11] netfilter: nf_conncount: add sequence counter to detect tree modifications Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 08/11] netfilter: conntrack: check NULL when retrieving ct extension Pablo Neira Ayuso
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
Another drive-by AI review:
1) tree_gc_worker fails to wrap around after it can't find more pending
work. Update data->gc_tree unconditionally. If it's 0, start from
the first pending tree (which can be 0).
2) tree_gc_worker() iterates the rbtree without lock. This is never
safe. Move iteration under the spinlock. If this takes too long
(resched needed), save key of next node, drop lock, resched, re-lock,
then search for the key (node). In very rare cases this node might
no longer exist, in that case we can just wait for next gc.
3) use disable_work_sync(), we don't want any restarts.
4) module exit function needs rcu_barrier before we zap the kmem cache.
Fixes: 5c789e131cbb ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search")
Closes: https://sashiko.dev/#/patchset/20260525182924.28456-1-fw%40strlen.de
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 54 +++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 1247cbe77740..dd67004a5cc0 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -595,47 +595,54 @@ static void tree_gc_worker(struct work_struct *work)
{
struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+ unsigned int tree, next_tree, gc_count = 0;
struct nf_conncount_root *root;
struct rb_node *node;
- unsigned int tree, next_tree, gc_count = 0;
+
+ if (data->gc_tree == 0)
+ data->gc_tree = find_first_bit(data->pending_trees, CONNCOUNT_SLOTS);
tree = data->gc_tree % CONNCOUNT_SLOTS;
root = &data->root[tree];
- local_bh_disable();
- rcu_read_lock();
- for (node = rb_first(&root->root); node ; node = rb_next(node)) {
- rbconn = rb_entry(node, struct nf_conncount_rb, node);
- if (nf_conncount_gc_list(data->net, &rbconn->list))
- gc_count++;
- }
- rcu_read_unlock();
- local_bh_enable();
-
- cond_resched();
-
spin_lock_bh(&root->lock);
- if (gc_count < ARRAY_SIZE(gc_nodes))
- goto next; /* do not bother */
-
gc_count = 0;
node = rb_first(&root->root);
while (node != NULL) {
+ u32 key[MAX_KEYLEN];
+ bool drop_lock;
+
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
- if (rbconn->list.count > 0)
- continue;
+ if (nf_conncount_gc_list(data->net, &rbconn->list))
+ gc_nodes[gc_count++] = rbconn;
+
+ drop_lock = need_resched();
- gc_nodes[gc_count++] = rbconn;
- if (gc_count >= ARRAY_SIZE(gc_nodes)) {
+ if (drop_lock || gc_count >= ARRAY_SIZE(gc_nodes)) {
tree_nodes_free(root, gc_nodes, gc_count);
gc_count = 0;
}
+
+ if (!drop_lock || !node)
+ continue;
+
+ rbconn = rb_entry(node, struct nf_conncount_rb, node);
+ memcpy(key, rbconn->key, sizeof(key));
+ spin_unlock_bh(&root->lock);
+
+ cond_resched();
+
+ spin_lock_bh(&root->lock);
+ rbconn = find_tree_node(root, data, key);
+ if (IS_ERR_OR_NULL(rbconn)) /* rbconn was reaped */
+ break;
+
+ node = &rbconn->node;
}
tree_nodes_free(root, gc_nodes, gc_count);
-next:
clear_bit(tree, data->pending_trees);
next_tree = (tree + 1) % CONNCOUNT_SLOTS;
@@ -644,6 +651,8 @@ static void tree_gc_worker(struct work_struct *work)
if (next_tree < CONNCOUNT_SLOTS) {
data->gc_tree = next_tree;
schedule_work(work);
+ } else {
+ data->gc_tree = 0;
}
spin_unlock_bh(&root->lock);
@@ -726,7 +735,7 @@ void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
unsigned int i;
- cancel_work_sync(&data->gc_work);
+ disable_work_sync(&data->gc_work);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
@@ -752,6 +761,7 @@ static int __init nf_conncount_modinit(void)
static void __exit nf_conncount_modexit(void)
{
+ rcu_barrier();
kmem_cache_destroy(conncount_conn_cachep);
kmem_cache_destroy(conncount_rb_cachep);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 08/11] netfilter: conntrack: check NULL when retrieving ct extension
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (6 preceding siblings ...)
2026-06-14 11:46 ` [PATCH net-next 07/11] netfilter: nf_conncount: gc and rcu fixes Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 09/11] netfilter: flowtable: bail out if forward path cannot be discovered Pablo Neira Ayuso
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
nf_ct_ext_find() might return NULL if ct extension is not found.
Also add NULL checks to:
- nfct_help()
- nfct_help_data()
- nfct_seqadj()
- nfct_nat()
This is defensive, for safety reasons.
nf_ct_ext_find() used to return NULL if the extension is stale for
unconfirmed conntracks if the genid validation fails.
Skip the NULL check in nf_nat_inet_fn(), since NULL is valid there
for uninitialized ct nat extensions.
While at it, fetch ct helper area in nf_ct_expect_related_report() only
once and pass it on to other ancillary functions. Replace WARN_ON()
by WARN_ON_ONCE() in nf_ct_unlink_expect_report().
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_conntrack_helper.h | 2 +
net/ipv4/netfilter/nf_nat_h323.c | 12 ++++++
net/ipv4/netfilter/nf_nat_pptp.c | 14 +++++--
net/netfilter/nf_conntrack_broadcast.c | 3 ++
net/netfilter/nf_conntrack_expect.c | 33 +++++++++--------
net/netfilter/nf_conntrack_ftp.c | 6 +++
net/netfilter/nf_conntrack_h323_main.c | 18 +++++++++
net/netfilter/nf_conntrack_pptp.c | 9 +++++
net/netfilter/nf_conntrack_proto_gre.c | 9 +++++
net/netfilter/nf_conntrack_sane.c | 3 ++
net/netfilter/nf_conntrack_seqadj.c | 17 ++++++---
net/netfilter/nf_conntrack_sip.c | 41 ++++++++++++++++++++-
net/netfilter/nf_nat_sip.c | 12 ++++++
net/netfilter/nfnetlink_cthelper.c | 6 +++
14 files changed, 158 insertions(+), 27 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ed93a5a1adc8..93207de4f2c8 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -136,6 +136,8 @@ static inline void *nfct_help_data(const struct nf_conn *ct)
struct nf_conn_help *help;
help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+ if (!help)
+ return NULL;
return (void *)help->data;
}
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index faee20af4856..19dad54ada09 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -100,6 +100,9 @@ static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
+ if (!info)
+ return -1;
+
for (i = 0; i < count; i++) {
if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
@@ -184,6 +187,9 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
int i;
u_int16_t nated_port;
+ if (!info)
+ return -1;
+
/* Set expectations for NAT */
rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
rtp_exp->expectfn = nf_nat_follow_master;
@@ -325,6 +331,9 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
int dir = CTINFO2DIR(ctinfo);
u_int16_t nated_port = ntohs(port);
+ if (!info)
+ return -1;
+
/* Set expectations for NAT */
exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
exp->expectfn = nf_nat_follow_master;
@@ -404,6 +413,9 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
u_int16_t nated_port = ntohs(port);
union nf_inet_addr addr;
+ if (!info)
+ return -1;
+
/* Set expectations for NAT */
exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
exp->expectfn = ip_nat_q931_expect;
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index fab357cc8559..fed5249001a4 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -53,11 +53,13 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct nf_conn_nat *nat;
nat = nf_ct_nat_ext_add(ct);
- if (WARN_ON_ONCE(!nat))
+ if (!nat)
return;
nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(master);
+ if (!ct_pptp_info)
+ return;
/* And here goes the grand finale of corrosion... */
if (exp->dir == IP_CT_DIR_ORIGINAL) {
@@ -132,11 +134,13 @@ pptp_outbound_pkt(struct sk_buff *skb,
__be16 new_callid;
unsigned int cid_off;
- if (WARN_ON_ONCE(!nat))
+ if (!nat)
return NF_DROP;
nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(ct);
+ if (!ct_pptp_info)
+ return NF_DROP;
new_callid = ct_pptp_info->pns_call_id;
@@ -204,11 +208,13 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
struct nf_ct_pptp_master *ct_pptp_info;
struct nf_nat_pptp *nat_pptp_info;
- if (WARN_ON_ONCE(!nat))
+ if (!nat)
return;
nat_pptp_info = &nat->help.nat_pptp_info;
ct_pptp_info = nfct_help_data(ct);
+ if (!ct_pptp_info)
+ return;
/* save original PAC call ID in nat_info */
nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;
@@ -241,7 +247,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
__be16 new_pcid;
unsigned int pcid_off;
- if (WARN_ON_ONCE(!nat))
+ if (!nat)
return NF_DROP;
nat_pptp_info = &nat->help.nat_pptp_info;
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index 75e53fde6b29..400119b6320e 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -29,6 +29,9 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
struct nf_conn_help *help = nfct_help(ct);
__be32 mask = 0;
+ if (!help)
+ goto out;
+
/* we're only interested in locally generated packets */
if (skb->sk == NULL || !net_eq(nf_ct_net(ct), sock_net(skb->sk)))
goto out;
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8e943efbdf0a..5c9b17835c28 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -52,8 +52,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
struct nf_conntrack_net *cnet;
lockdep_nfct_expect_lock_held();
- WARN_ON(!master_help);
- WARN_ON(timer_pending(&exp->timeout));
+ WARN_ON_ONCE(timer_pending(&exp->timeout));
hlist_del_rcu(&exp->hnode);
@@ -61,7 +60,8 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
cnet->expect_count--;
hlist_del_rcu(&exp->lnode);
- master_help->expecting[exp->class]--;
+ if (master_help)
+ master_help->expecting[exp->class]--;
nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
nf_ct_expect_put(exp);
@@ -405,10 +405,10 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp)
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);
-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
+static void nf_ct_expect_insert(struct nf_conntrack_expect *exp,
+ struct nf_conn_help *master_help)
{
struct nf_conntrack_net *cnet;
- struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
@@ -436,10 +436,9 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
}
/* Race with expectations being used means we could have none to find; OK. */
-static void evict_oldest_expect(struct nf_conn *master,
+static void evict_oldest_expect(struct nf_conn_help *master_help,
struct nf_conntrack_expect *new)
{
- struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_expect *exp, *last = NULL;
hlist_for_each_entry(exp, &master_help->expectations, lnode) {
@@ -452,13 +451,12 @@ static void evict_oldest_expect(struct nf_conn *master,
}
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
+ struct nf_conn_help *master_help,
unsigned int flags)
{
const struct nf_conntrack_expect_policy *p;
struct nf_conntrack_expect *i;
struct nf_conntrack_net *cnet;
- struct nf_conn *master = expect->master;
- struct nf_conn_help *master_help = nfct_help(master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(expect);
struct hlist_node *next;
@@ -467,10 +465,6 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
lockdep_nfct_expect_lock_held();
- if (!master_help) {
- ret = -ESHUTDOWN;
- goto out;
- }
h = nf_ct_expect_dst_hash(net, &expect->tuple);
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (master_matches(i, expect, flags) &&
@@ -493,7 +487,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
p = &helper->expect_policy[expect->class];
if (p->max_expected &&
master_help->expecting[expect->class] >= p->max_expected) {
- evict_oldest_expect(master, expect);
+ evict_oldest_expect(master_help, expect);
if (master_help->expecting[expect->class]
>= p->max_expected) {
ret = -EMFILE;
@@ -514,14 +508,21 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect,
int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
u32 portid, int report, unsigned int flags)
{
+ struct nf_conn_help *master_help;
int ret;
spin_lock_bh(&nf_conntrack_expect_lock);
- ret = __nf_ct_expect_check(expect, flags);
+ master_help = nfct_help(expect->master);
+ if (!master_help) {
+ ret = -ESHUTDOWN;
+ goto out;
+ }
+
+ ret = __nf_ct_expect_check(expect, master_help, flags);
if (ret < 0)
goto out;
- nf_ct_expect_insert(expect);
+ nf_ct_expect_insert(expect, master_help);
nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
spin_unlock_bh(&nf_conntrack_expect_lock);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index c7777f37371a..0847f845613d 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -384,6 +384,9 @@ static int help(struct sk_buff *skb,
int found = 0, ends_in_nl;
nf_nat_ftp_hook_fn *nf_nat_ftp;
+ if (!ct_ftp_info)
+ return NF_DROP;
+
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
ctinfo != IP_CT_ESTABLISHED_REPLY) {
@@ -545,6 +548,9 @@ static int nf_ct_ftp_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
{
struct nf_ct_ftp_master *ftp = nfct_help_data(ct);
+ if (!ftp)
+ return -ENOENT;
+
/* This conntrack has been injected from user-space, always pick up
* sequence tracking. Otherwise, the first FTP command after the
* failover breaks.
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index ebae9fdab897..7f189dceb3c4 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -76,6 +76,9 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff,
int tpktlen;
int tpktoff;
+ if (!info)
+ return 0;
+
/* Get TCP header */
th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
if (th == NULL)
@@ -1191,6 +1194,9 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct,
union nf_inet_addr addr;
struct nf_conntrack_expect *exp;
+ if (!info)
+ return -1;
+
/* Look for the first related address */
for (i = 0; i < count; i++) {
if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
@@ -1307,6 +1313,9 @@ static int process_rrq(struct sk_buff *skb, struct nf_conn *ct,
const struct nfct_h323_nat_hooks *nathook;
int ret;
+ if (!info)
+ return -1;
+
pr_debug("nf_ct_ras: RRQ\n");
ret = expect_q931(skb, ct, ctinfo, protoff, data,
@@ -1345,6 +1354,9 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
int ret;
struct nf_conntrack_expect *exp;
+ if (!info)
+ return -1;
+
pr_debug("nf_ct_ras: RCF\n");
nathook = rcu_dereference(nfct_h323_nat_hook);
@@ -1395,6 +1407,9 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
int dir = CTINFO2DIR(ctinfo);
int ret;
+ if (!info)
+ return -1;
+
pr_debug("nf_ct_ras: URQ\n");
nathook = rcu_dereference(nfct_h323_nat_hook);
@@ -1429,6 +1444,9 @@ static int process_arq(struct sk_buff *skb, struct nf_conn *ct,
__be16 port;
union nf_inet_addr addr;
+ if (!info)
+ return 0;
+
pr_debug("nf_ct_ras: ARQ\n");
nathook = rcu_dereference(nfct_h323_nat_hook);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index ed567a1cf7fd..776505a78e64 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -198,6 +198,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff,
u_int16_t msg;
__be16 cid = 0, pcid = 0;
+ if (!info)
+ return NF_DROP;
+
msg = ntohs(ctlh->messageType);
pr_debug("inbound control message %s\n", pptp_msg_name(msg));
@@ -325,6 +328,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff,
u_int16_t msg;
__be16 cid = 0, pcid = 0;
+ if (!info)
+ return NF_DROP;
+
msg = ntohs(ctlh->messageType);
pr_debug("outbound control message %s\n", pptp_msg_name(msg));
@@ -443,6 +449,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff,
int ret;
u_int16_t msg;
+ if (!info)
+ return NF_DROP;
+
#if IS_ENABLED(CONFIG_NF_NAT)
if (!nf_ct_is_confirmed(ct) && (ct->status & IPS_NAT_MASK)) {
struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 473658259f1a..616ab1e2fc5e 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -131,6 +131,9 @@ bool nf_ct_gre_keymap_add(struct nf_conn *ct,
struct nf_ct_gre_keymap *km_orig, *km_repl;
bool ret = false;
+ if (!ct_pptp_info)
+ return false;
+
km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC);
if (!km_orig)
return false;
@@ -187,6 +190,9 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct)
struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
enum ip_conntrack_dir dir;
+ if (!ct_pptp_info)
+ return;
+
pr_debug("entering for ct %p\n", ct);
spin_lock_bh(&keymap_lock);
@@ -389,6 +395,9 @@ void gre_pptp_destroy_siblings(struct nf_conn *ct)
const struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
struct nf_conntrack_tuple t;
+ if (!ct_pptp_info)
+ return;
+
nf_ct_gre_keymap_destroy(ct);
/* try original (pns->pac) tuple */
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index a7f7b07ba0c2..39085acf7a71 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -74,6 +74,9 @@ static int help(struct sk_buff *skb,
struct sane_reply_net_start repl;
} buf;
+ if (!ct_sane_info)
+ return NF_DROP;
+
/* Until there's been traffic both ways, don't look in packets. */
if (ctinfo != IP_CT_ESTABLISHED &&
ctinfo != IP_CT_ESTABLISHED_REPLY)
diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c
index b7e99f34dfce..220216a4edc5 100644
--- a/net/netfilter/nf_conntrack_seqadj.c
+++ b/net/netfilter/nf_conntrack_seqadj.c
@@ -18,9 +18,12 @@ int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
return 0;
spin_lock_bh(&ct->lock);
- set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
-
seqadj = nfct_seqadj(ct);
+ if (!seqadj) {
+ spin_unlock_bh(&ct->lock);
+ return 0;
+ }
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
this_way = &seqadj->seq[dir];
this_way->offset_before = off;
this_way->offset_after = off;
@@ -39,10 +42,8 @@ int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
if (off == 0)
return 0;
- if (unlikely(!seqadj)) {
- WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n");
+ if (unlikely(!seqadj))
return 0;
- }
set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
@@ -125,6 +126,9 @@ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb,
struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
unsigned int dir, optoff, optend;
+ if (!seqadj)
+ return 0;
+
optoff = protoff + sizeof(struct tcphdr);
optend = protoff + tcph->doff * 4;
@@ -175,6 +179,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb,
struct nf_ct_seqadj *this_way, *other_way;
int res = 1;
+ if (!seqadj)
+ return 0;
+
this_way = &seqadj->seq[dir];
other_way = &seqadj->seq[!dir];
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 2c78a3e1dab5..c606d1f60b58 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -887,6 +887,9 @@ static int refresh_signalling_expectation(struct nf_conn *ct,
struct hlist_node *next;
int found = 0;
+ if (!help)
+ return 0;
+
spin_lock_bh(&nf_conntrack_expect_lock);
hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
if (exp->class != SIP_EXPECT_SIGNALLING ||
@@ -910,6 +913,9 @@ static void flush_expectations(struct nf_conn *ct, bool media)
struct nf_conntrack_expect *exp;
struct hlist_node *next;
+ if (!help)
+ return;
+
spin_lock_bh(&nf_conntrack_expect_lock);
hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media)
@@ -940,6 +946,11 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
u_int16_t base_port;
__be16 rtp_port, rtcp_port;
const struct nf_nat_sip_hooks *hooks;
+ struct nf_conn_help *help;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_DROP;
saddr = NULL;
if (sip_direct_media) {
@@ -1002,7 +1013,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple);
if (!exp || exp->master == ct ||
- exp->helper != nfct_help(ct)->helper ||
+ exp->helper != help->helper ||
exp->class != class)
break;
#if IS_ENABLED(CONFIG_NF_NAT)
@@ -1227,6 +1238,9 @@ static int process_invite_response(struct sk_buff *skb, unsigned int protoff,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ if (!ct_sip_info)
+ return NF_DROP;
+
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1244,6 +1258,9 @@ static int process_update_response(struct sk_buff *skb, unsigned int protoff,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ if (!ct_sip_info)
+ return NF_DROP;
+
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1261,6 +1278,9 @@ static int process_prack_response(struct sk_buff *skb, unsigned int protoff,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
+ if (!ct_sip_info)
+ return NF_DROP;
+
if ((code >= 100 && code <= 199) ||
(code >= 200 && code <= 299))
return process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
@@ -1279,6 +1299,9 @@ static int process_invite_request(struct sk_buff *skb, unsigned int protoff,
struct nf_ct_sip_master *ct_sip_info = nfct_help_data(ct);
unsigned int ret;
+ if (!ct_sip_info)
+ return NF_DROP;
+
flush_expectations(ct, true);
ret = process_sdp(skb, protoff, dataoff, dptr, datalen, cseq);
if (ret == NF_ACCEPT)
@@ -1316,11 +1339,15 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
union nf_inet_addr *saddr, daddr;
const struct nf_nat_sip_hooks *hooks;
struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
__be16 port;
u8 proto;
unsigned int expires = 0;
int ret;
+ if (!ct_sip_info)
+ return NF_DROP;
+
/* Expected connections can not register again. */
if (ct->status & IPS_EXPECTED)
return NF_ACCEPT;
@@ -1366,7 +1393,11 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff,
goto store_cseq;
}
- helper = rcu_dereference(nfct_help(ct)->helper);
+ help = nfct_help(ct);
+ if (!help)
+ return NF_DROP;
+
+ helper = rcu_dereference(help->helper);
if (!helper)
return NF_DROP;
@@ -1421,6 +1452,9 @@ static int process_register_response(struct sk_buff *skb, unsigned int protoff,
unsigned int expires = 0;
int in_contact = 0, ret;
+ if (!ct_sip_info)
+ return NF_DROP;
+
/* According to RFC 3261, "UAs MUST NOT send a new registration until
* they have received a final response from the registrar for the
* previous one or the previous REGISTER request has timed out".
@@ -1550,6 +1584,9 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff,
union nf_inet_addr addr;
__be16 port;
+ if (!ct_sip_info)
+ return NF_DROP;
+
/* Many Cisco IP phones use a high source port for SIP requests, but
* listen for the response on port 5060. If we are the local
* router for one of these phones, save the port number from the
diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c
index 9fbfc6bff0c2..7f29a6785327 100644
--- a/net/netfilter/nf_nat_sip.c
+++ b/net/netfilter/nf_nat_sip.c
@@ -106,6 +106,9 @@ static int map_addr(struct sk_buff *skb, unsigned int protoff,
union nf_inet_addr newaddr;
__be16 newport;
+ if (!ct_sip_info)
+ return 0;
+
if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, addr) &&
ct->tuplehash[dir].tuple.src.u.udp.port == port) {
newaddr = ct->tuplehash[!dir].tuple.dst.u3;
@@ -158,6 +161,9 @@ static unsigned int nf_nat_sip(struct sk_buff *skb, unsigned int protoff,
__be16 port;
int request, in_header;
+ if (!ct_sip_info)
+ return NF_DROP;
+
/* Basic rules: requests and responses. */
if (strncasecmp(*dptr, "SIP/2.0", strlen("SIP/2.0")) != 0) {
if (ct_sip_parse_request(ct, *dptr, *datalen,
@@ -326,6 +332,9 @@ static void nf_nat_sip_expected(struct nf_conn *ct,
int range_set_for_snat = 0;
struct nf_nat_range2 range;
+ if (!help)
+ return;
+
/* This must be a fresh one. */
BUG_ON(ct->status & IPS_NAT_DONE_MASK);
@@ -390,6 +399,9 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff,
char buffer[INET6_ADDRSTRLEN + sizeof("[]:nnnnn")];
unsigned int buflen;
+ if (!ct_sip_info)
+ return NF_DROP;
+
/* Connection will come from reply */
if (nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
&ct->tuplehash[!dir].tuple.dst.u3))
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 033ea90c4401..f1460b683d7a 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -101,6 +101,9 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct)
struct nf_conn_help *help = nfct_help(ct);
const struct nf_conntrack_helper *helper;
+ if (!help)
+ return -EINVAL;
+
if (attr == NULL)
return -EINVAL;
@@ -118,6 +121,9 @@ nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct)
const struct nf_conn_help *help = nfct_help(ct);
const struct nf_conntrack_helper *helper;
+ if (!help)
+ return 0;
+
helper = rcu_dereference(help->helper);
if (helper && helper->data_len &&
nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data))
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 09/11] netfilter: flowtable: bail out if forward path cannot be discovered
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (7 preceding siblings ...)
2026-06-14 11:46 ` [PATCH net-next 08/11] netfilter: conntrack: check NULL when retrieving ct extension Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 10/11] ipvs: fix doc syntax for conn_max sysctl Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 11/11] netfilter: nf_dup_netdev: add nf_dev_xmit_recursion*() helpers and use them Pablo Neira Ayuso
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
If forward path discovery fails for any reason, or the netdevice is not
registered for this flowtable, then bail out to the classic forwarding
path rather than providing an incomplete forwarding path.
Update the existing forward path parser functions to report an error
so the flow_offload expression gives up on setting up the flowtable
entry.
Link: https://sashiko.dev/#/patchset/20260607094954.48892-15-pablo%40netfilter.org?part=14
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_flow_table_path.c | 81 +++++++++++++++++-------------
1 file changed, 46 insertions(+), 35 deletions(-)
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index a3e6b82f2f8e..1e7e216b9f89 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -90,9 +90,9 @@ struct nft_forward_info {
enum flow_offload_xmit_type xmit_type;
};
-static void nft_dev_path_info(const struct net_device_path_stack *stack,
- struct nft_forward_info *info,
- unsigned char *ha, struct nf_flowtable *flowtable)
+static int nft_dev_path_info(const struct net_device_path_stack *stack,
+ struct nft_forward_info *info,
+ unsigned char *ha, struct nf_flowtable *flowtable)
{
const struct net_device_path *path;
int i;
@@ -120,19 +120,17 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
/* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */
if (path->type == DEV_PATH_TUN) {
- if (info->num_tuns) {
- info->indev = NULL;
- break;
- }
+ if (info->num_tuns)
+ return -1;
+
info->tun.src_v6 = path->tun.src_v6;
info->tun.dst_v6 = path->tun.dst_v6;
info->tun.l3_proto = path->tun.l3_proto;
info->num_tuns++;
} else {
- if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
- info->indev = NULL;
- break;
- }
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX)
+ return -1;
+
info->encap[info->num_encaps].id =
path->encap.id;
info->encap[info->num_encaps].proto =
@@ -151,22 +149,23 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
switch (path->bridge.vlan_mode) {
case DEV_PATH_BR_VLAN_UNTAG_HW:
+ if (info->num_encaps == 0)
+ return -1;
+
info->ingress_vlans |= BIT(info->num_encaps - 1);
break;
case DEV_PATH_BR_VLAN_TAG:
- if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
- info->indev = NULL;
- break;
- }
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX)
+ return -1;
+
info->encap[info->num_encaps].id = path->bridge.vlan_id;
info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
info->num_encaps++;
break;
case DEV_PATH_BR_VLAN_UNTAG:
- if (info->num_encaps == 0) {
- info->indev = NULL;
- break;
- }
+ if (info->num_encaps == 0)
+ return -1;
+
info->num_encaps--;
break;
case DEV_PATH_BR_VLAN_KEEP:
@@ -175,8 +174,7 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
break;
default:
- info->indev = NULL;
- break;
+ return -1;
}
}
info->outdev = info->indev;
@@ -184,6 +182,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack,
if (nf_flowtable_hw_offload(flowtable) &&
nft_is_valid_ether_device(info->indev))
info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+
+ return 0;
}
static bool nft_flowtable_find_dev(const struct net_device *dev,
@@ -241,11 +241,11 @@ static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt,
return 0;
}
-static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
- struct nf_flow_route *route,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- struct nft_flowtable *ft)
+static int nft_dev_forward_path(const struct nft_pktinfo *pkt,
+ struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
{
const struct dst_entry *dst = route->tuple[dir].dst;
struct net_device_path_stack stack;
@@ -253,15 +253,16 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
unsigned char ha[ETH_ALEN];
int i;
- if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
- nft_dev_path_info(&stack, &info, ha, &ft->data);
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) < 0 ||
+ nft_dev_path_info(&stack, &info, ha, &ft->data) < 0)
+ return -ENOENT;
+
+ if (!nft_flowtable_find_dev(info.indev, ft))
+ return -ENOENT;
if (info.outdev)
route->tuple[dir].out.ifindex = info.outdev->ifindex;
- if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
- return;
-
route->tuple[!dir].in.ifindex = info.indev->ifindex;
for (i = 0; i < info.num_encaps; i++) {
route->tuple[!dir].in.encap[i].id = info.encap[i].id;
@@ -285,6 +286,8 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt,
route->tuple[dir].xmit_type = info.xmit_type;
}
route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment;
+
+ return 0;
}
int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
@@ -329,11 +332,19 @@ int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
nft_default_forward_path(route, this_dst, dir);
nft_default_forward_path(route, other_dst, !dir);
- if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
- nft_dev_forward_path(pkt, route, ct, dir, ft);
- if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
- nft_dev_forward_path(pkt, route, ct, !dir, ft);
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ nft_dev_forward_path(pkt, route, ct, dir, ft) < 0)
+ goto err_dst_release;
+
+ if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ nft_dev_forward_path(pkt, route, ct, !dir, ft) < 0)
+ goto err_dst_release;
return 0;
+
+err_dst_release:
+ dst_release(route->tuple[dir].dst);
+ dst_release(route->tuple[!dir].dst);
+ return -ENOENT;
}
EXPORT_SYMBOL_GPL(nft_flow_route);
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 10/11] ipvs: fix doc syntax for conn_max sysctl
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (8 preceding siblings ...)
2026-06-14 11:46 ` [PATCH net-next 09/11] netfilter: flowtable: bail out if forward path cannot be discovered Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 11/11] netfilter: nf_dup_netdev: add nf_dev_xmit_recursion*() helpers and use them Pablo Neira Ayuso
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Julian Anastasov <ja@ssi.bg>
Fix the docutils error reported by kernel test robot
for the new conn_max sysctl:
Documentation/networking/ipvs-sysctl.rst:76: WARNING: Block quote ends
without a blank line; unexpected unindent. [docutils]
Documentation/networking/ipvs-sysctl.rst:76: ERROR: Unexpected section
title or transition.
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202606071851.Dc1H7hOO-lkp@intel.com/
Fixes: 4a15044a2b06 ("ipvs: add conn_max sysctl to limit connections")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
Documentation/networking/ipvs-sysctl.rst | 23 ++++++++++++++++-------
1 file changed, 16 insertions(+), 7 deletions(-)
diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst
index b6bac2612420..fe36f4fcd3a0 100644
--- a/Documentation/networking/ipvs-sysctl.rst
+++ b/Documentation/networking/ipvs-sysctl.rst
@@ -72,20 +72,29 @@ conn_max - INTEGER
Netfilter connection tracking) the connections can be
limited also by nf_conntrack_max.
- soft limit hard limit
- =====================================================
- init_net:
+ Limits for init_net:
+
+ ======================= =============== =============
+ \ soft limit hard limit
+ ======================= =============== =============
create netns platform platform
priv admin 0 .. platform 0 .. platform
- =====================================================
- new netns:
+ ======================= =============== =============
+
+ Limits for new netns:
+
+ ======================= =============== =============
+ \ soft limit hard limit
+ ======================= =============== =============
create netns init_net:soft init_net:soft
priv admin 0 .. platform 0 .. platform
unpriv admin 0 .. hard N/A
+ ======================= =============== =============
Limits per platform:
- 1,073,741,824 (2^30 for 64-bit)
- 16,777,216 (2^24 for 32-bit)
+
+ - 1,073,741,824 (2^30 for 64-bit)
+ - 16,777,216 (2^24 for 32-bit)
Possible values: 0 .. platform limit
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH net-next 11/11] netfilter: nf_dup_netdev: add nf_dev_xmit_recursion*() helpers and use them
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
` (9 preceding siblings ...)
2026-06-14 11:46 ` [PATCH net-next 10/11] ipvs: fix doc syntax for conn_max sysctl Pablo Neira Ayuso
@ 2026-06-14 11:46 ` Pablo Neira Ayuso
10 siblings, 0 replies; 12+ messages in thread
From: Pablo Neira Ayuso @ 2026-06-14 11:46 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
Update nft_dup and nft_fwd to use the nf_dev_xmit_recursion() helpers.
This patch also disables BH when transmitting the skb to address a
possible migration to a different CPU, which would lead to an unbalanced
decrement of the recursion counters.
This is modeled after Florian Westphal's dev_xmit_recursion*() API
available since commit 97cdcf37b57e ("net: place xmit recursion in
softnet data") according to its current state in the tree.
Fixes: 1d47b55b36d2 ("netfilter: nft_fwd_netdev: use recursion counter in neigh egress path")
Fixes: f37ad9127039 ("netfilter: nf_dup_netdev: Move the recursion counter struct netdev_xmit")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_dup_netdev.h | 34 +++++++++++++++++++++++----
net/netfilter/nf_dup_netdev.c | 15 ++++++------
net/netfilter/nft_fwd_netdev.c | 17 ++++++++------
3 files changed, 47 insertions(+), 19 deletions(-)
diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index 609bcf422a9b..f6b05bd80c3f 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -11,15 +11,39 @@ void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
#define NF_RECURSION_LIMIT 2
-static inline u8 *nf_get_nf_dup_skb_recursion(void)
-{
#ifndef CONFIG_PREEMPT_RT
- return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+static inline bool nf_dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.nf_dup_skb_recursion) >
+ NF_RECURSION_LIMIT);
+}
+
+static inline void nf_dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.nf_dup_skb_recursion);
+}
+
+static inline void nf_dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.nf_dup_skb_recursion);
+}
#else
- return &current->net_xmit.nf_dup_skb_recursion;
-#endif
+static inline bool nf_dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.nf_dup_skb_recursion > NF_RECURSION_LIMIT);
+}
+
+static inline void nf_dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.nf_dup_skb_recursion++;
}
+static inline void nf_dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.nf_dup_skb_recursion--;
+}
+#endif
+
struct nft_offload_ctx;
struct nft_flow_rule;
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 3b0a70e154cd..c189716e986a 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -16,11 +16,6 @@
static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
enum nf_dev_hooks hook)
{
- u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
-
- if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT)
- goto err;
-
if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) {
if (skb_cow_head(skb, skb->mac_len))
goto err;
@@ -30,9 +25,15 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
skb->dev = dev;
skb_clear_tstamp(skb);
- (*nf_dup_skb_recursion)++;
+ local_bh_disable();
+ if (nf_dev_xmit_recursion()) {
+ local_bh_enable();
+ goto err;
+ }
+ nf_dev_xmit_recursion_inc();
dev_queue_xmit(skb);
- (*nf_dup_skb_recursion)--;
+ nf_dev_xmit_recursion_dec();
+ local_bh_enable();
return;
err:
kfree_skb(skb);
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index b9e88d7cf308..a48c2f765bba 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -95,7 +95,6 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
- u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion();
struct nft_fwd_neigh *priv = nft_expr_priv(expr);
void *addr = &regs->data[priv->sreg_addr];
int oif = regs->data[priv->sreg_dev];
@@ -154,13 +153,15 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
goto out;
}
- if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) {
+ dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+ if (!dev) {
verdict = NF_DROP;
goto out;
}
- dev = dev_get_by_index_rcu(nft_net(pkt), oif);
- if (dev == NULL) {
+ local_bh_disable();
+ if (nf_dev_xmit_recursion()) {
+ local_bh_enable();
verdict = NF_DROP;
goto out;
}
@@ -169,16 +170,18 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
skb = skb_expand_head(skb, hh_len);
if (!skb) {
- verdict = NF_STOLEN;
+ local_bh_enable();
goto out;
}
}
skb->dev = dev;
skb_clear_tstamp(skb);
- (*nf_dup_skb_recursion)++;
+
+ nf_dev_xmit_recursion_inc();
neigh_xmit(neigh_table, dev, addr, skb);
- (*nf_dup_skb_recursion)--;
+ nf_dev_xmit_recursion_dec();
+ local_bh_enable();
out:
regs->verdict.code = verdict;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread