* [PATCH net 1/9] netfilter: nf_conntrack_expect: zero at allocation time
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 2/9] netfilter: nft_set_pipapo: don't leak bad clone into future transaction Florian Westphal
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
There are occasional LLM hints about leaking uninitialized data to
userspace via ctnetlink. Just zero at allocation time;
expectations are not frequently used these days.
Intentionally keeps _init as-is because we could theoretically
support re-init, so add the missing exp->dir there.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nf_conntrack_expect.c | 3 ++-
net/netfilter/nf_conntrack_netlink.c | 11 +----------
2 files changed, 3 insertions(+), 11 deletions(-)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 38630c5e006f..7ae68d60586a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -306,7 +306,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
struct nf_conntrack_expect *new;
- new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+ new = kmem_cache_zalloc(nf_ct_expect_cachep, GFP_ATOMIC);
if (!new)
return NULL;
@@ -391,6 +391,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
#if IS_ENABLED(CONFIG_NF_NAT)
memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+ exp->dir = 0;
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4217715d42dc..31cbb1b55b9e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3549,8 +3549,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
if (cda[CTA_EXPECT_FLAGS]) {
exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
exp->flags &= ~NF_CT_EXPECT_USERSPACE;
- } else {
- exp->flags = 0;
}
if (cda[CTA_EXPECT_FN]) {
const char *name = nla_data(cda[CTA_EXPECT_FN]);
@@ -3562,8 +3560,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
goto err_out;
}
exp->expectfn = expfn->expectfn;
- } else
- exp->expectfn = NULL;
+ }
exp->class = class;
exp->master = ct;
@@ -3583,12 +3580,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
exp, nf_ct_l3num(ct));
if (err < 0)
goto err_out;
-#if IS_ENABLED(CONFIG_NF_NAT)
- } else {
- memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
- memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
- exp->dir = 0;
-#endif
}
return exp;
err_out:
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 2/9] netfilter: nft_set_pipapo: don't leak bad clone into future transaction
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
2026-06-30 4:52 ` [PATCH net 1/9] netfilter: nf_conntrack_expect: zero at allocation time Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 3/9] netfilter: ipset: fix race between dump and ip_set_list resize Florian Westphal
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
On memory allocation failure the cloned nft_pipapo_match can enter a bad
state:
- some fields can have their lookup tables resized while others did
not
- bits might have been toggled
- scratch map can be undersized which also means m->bsize_max can be
lower than what is required
This means that the next insertion in the same batch can trigger
out-of-bounds writes.
Furthermore, a failure in the first insertion can result in the bad clone
leaking into the next transaction, because the abort callback is never
executed in this case (the upper layer saw an error and no attempt to
allocate a transactional request was made).
Record a state for the nft_pipapo_match structure:
- NEW (pristine clone)
- MOD (modified clone with good state)
- ERR (potentially bogus content)
Then make it so that deletes and insertions fail when the clone
entered ERR state.
In case the very first insert attempt results in an error, free the
clone right away.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Cc: stable@vger.kernel.org
Reported-and-tested-by: Seesee <cjc000013@gmail.com>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_set_pipapo.c | 34 +++++++++++++++++++++++++++++-----
net/netfilter/nft_set_pipapo.h | 8 ++++++++
2 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 706c78853f24..978bb0c01106 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -342,6 +342,8 @@
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
+static void nft_pipapo_abort(const struct nft_set *set);
+
/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
@@ -1296,7 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const u8 *start_p, *end_p;
int i, bsize_max, err = 0;
- if (!m)
+ if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
return -ENOMEM;
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
@@ -1367,8 +1369,10 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
ret = pipapo_expand(f, start, end, f->groups * f->bb);
- if (ret < 0)
- return ret;
+ if (ret < 0) {
+ err = ret;
+ goto abort;
+ }
if (f->bsize > bsize_max)
bsize_max = f->bsize;
@@ -1384,7 +1388,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
err = pipapo_realloc_scratch(m, bsize_max);
if (err)
- return err;
+ goto abort;
m->bsize_max = bsize_max;
} else {
@@ -1396,7 +1400,26 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
pipapo_map(m, rulemap, e);
+ m->state = NFT_PIPAPO_CLONE_MOD;
return 0;
+abort:
+ DEBUG_NET_WARN_ON_ONCE(m->state == NFT_PIPAPO_CLONE_ERR);
+
+ /* Two rollback cases:
+ * 1) no previous changes. nft_pipapo_abort is not
+ * guaranteed to be invoked (there might be no further
+ * add/delete requests coming after this).
+ *
+ * 2) we had previous changes: there are transaction
+ * records pointing to this set. Leave the rollback to
+ * the transaction handling.
+ */
+ if (m->state == NFT_PIPAPO_CLONE_NEW)
+ nft_pipapo_abort(set); /* releases m */
+ else
+ m->state = NFT_PIPAPO_CLONE_ERR;
+
+ return err;
}
/**
@@ -1473,6 +1496,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
dst++;
}
+ new->state = NFT_PIPAPO_CLONE_NEW;
return new;
out_mt:
@@ -1896,7 +1920,7 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
/* removal must occur on priv->clone, if we are low on memory
* we have no choice and must fail the removal request.
*/
- if (!m)
+ if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
return NULL;
e = pipapo_get(m, (const u8 *)elem->key.val.data,
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index b82abb03576e..a19e980d06ef 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -131,9 +131,16 @@ struct nft_pipapo_scratch {
unsigned long __map[];
};
+enum nft_pipapo_clone_state {
+ NFT_PIPAPO_CLONE_NEW,
+ NFT_PIPAPO_CLONE_MOD,
+ NFT_PIPAPO_CLONE_ERR,
+};
+
/**
* struct nft_pipapo_match - Data used for lookup and matching
* @field_count: Amount of fields in set
+ * @state: add/delete state; used from control plane
* @bsize_max: Maximum lookup table bucket size of all fields, in longs
* @scratch: Preallocated per-CPU maps for partial matching results
* @rcu: Matching data is swapped on commits
@@ -141,6 +148,7 @@ struct nft_pipapo_scratch {
*/
struct nft_pipapo_match {
u8 field_count;
+ enum nft_pipapo_clone_state state:8;
unsigned int bsize_max;
struct nft_pipapo_scratch * __percpu *scratch;
struct rcu_head rcu;
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 3/9] netfilter: ipset: fix race between dump and ip_set_list resize
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
2026-06-30 4:52 ` [PATCH net 1/9] netfilter: nf_conntrack_expect: zero at allocation time Florian Westphal
2026-06-30 4:52 ` [PATCH net 2/9] netfilter: nft_set_pipapo: don't leak bad clone into future transaction Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 4/9] netfilter: nf_conntrack_sip: validate skb_dst() before accessing it Florian Westphal
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
From: Xiang Mei <xmei5@asu.edu>
The release path of ip_set_dump_do() and ip_set_dump_done() read
inst->ip_set_list via ip_set_ref_netlink(), a plain rcu_dereference_raw()
of the array pointer. These run from netlink_recvmsg() without the nfnl
mutex and without an RCU read-side critical section.
A concurrent ip_set_create() can grow the array: it publishes the new
array, calls synchronize_net() and then kvfree()s the old one. Since the
dump paths read the array outside any RCU reader, synchronize_net() does
not wait for them and the old array can be freed while they still index
into it, causing a use-after-free.
The dumped set itself stays pinned via set->ref_netlink, so only the
array load needs protecting. Take rcu_read_lock() around it, matching
ip_set_get_byname() and __ip_set_put_byindex().
BUG: KASAN: slab-use-after-free in ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
Read of size 8 at addr ffff88800b5c4018 by task exploit/150
Call Trace:
...
kasan_report (mm/kasan/report.c:595)
ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1697)
netlink_dump (net/netlink/af_netlink.c:2325)
netlink_recvmsg (net/netlink/af_netlink.c:1976)
sock_recvmsg (net/socket.c:1159)
__sys_recvfrom (net/socket.c:2315)
...
Oops: general protection fault, probably for non-canonical address ... KASAN NOPTI
KASAN: maybe wild-memory-access in range [0x02d6...d0-0x02d6...d7]
RIP: 0010:ip_set_dump_do (net/netfilter/ipset/ip_set_core.c:1698)
Kernel panic - not syncing: Fatal exception
Fixes: 8a02bdd50b2e ("netfilter: ipset: Fix calling ip_set() macro at dumping")
Cc: stable@vger.kernel.org
Reported-by: Weiming Shi <bestswngs@gmail.com>
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Xiang Mei <xmei5@asu.edu>
Acked-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/ipset/ip_set_core.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a531b654b8d9..6cfad152d7d1 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1480,7 +1480,11 @@ ip_set_dump_done(struct netlink_callback *cb)
struct ip_set_net *inst =
(struct ip_set_net *)cb->args[IPSET_CB_NET];
ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
- struct ip_set *set = ip_set_ref_netlink(inst, index);
+ struct ip_set *set;
+
+ rcu_read_lock();
+ set = ip_set_ref_netlink(inst, index);
+ rcu_read_unlock();
if (set->variant->uref)
set->variant->uref(set, cb, false);
@@ -1686,7 +1690,9 @@ ip_set_dump_do(struct sk_buff *skb, struct netlink_callback *cb)
release_refcount:
/* If there was an error or set is done, release set */
if (ret || !cb->args[IPSET_CB_ARG0]) {
+ rcu_read_lock();
set = ip_set_ref_netlink(inst, index);
+ rcu_read_unlock();
if (set->variant->uref)
set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name);
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 4/9] netfilter: nf_conntrack_sip: validate skb_dst() before accessing it
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (2 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 3/9] netfilter: ipset: fix race between dump and ip_set_list resize Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 5/9] netfilter: nfnetlink_cthelper: cap to maximum number of expectation per master Florian Westphal
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
From: Pablo Neira Ayuso <pablo@netfilter.org>
tc ingress and openvswitch do not guarantee routing information to be
available. These subsystems use the conntrack helper infrastructure, and
the SIP helper relies on the skb_dst() to be present if
sip_external_media is set to 1 (which is disabled by default as a module
parameter).
This effectively disables the sip_external_media toggle for these
subsystems without resulting in a crash.
Fixes: cae3a2627520 ("openvswitch: Allow attaching helpers to ct action")
Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Cc: stable@vger.kernel.org
Reported-by: Ren Wei <n05ec@lzu.edu.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nf_conntrack_sip.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 5ec3a4a4bbd7..f3f90a866338 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -956,7 +956,6 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
return NF_ACCEPT;
saddr = &ct->tuplehash[!dir].tuple.src.u3;
} else if (sip_external_media) {
- struct net_device *dev = skb_dst(skb)->dev;
struct dst_entry *dst = NULL;
struct flowi fl;
@@ -978,7 +977,11 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
* through the same interface as the signalling peer.
*/
if (dst) {
- bool external_media = (dst->dev == dev);
+ const struct dst_entry *this_dst = skb_dst(skb);
+ bool external_media = false;
+
+ if (this_dst && dst->dev == this_dst->dev)
+ external_media = true;
dst_release(dst);
if (external_media)
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 5/9] netfilter: nfnetlink_cthelper: cap to maximum number of expectation per master
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (3 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 4/9] netfilter: nf_conntrack_sip: validate skb_dst() before accessing it Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 6/9] netfilter: nft_fib: reject fib expression on the netdev egress hook Florian Westphal
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
From: Pablo Neira Ayuso <pablo@netfilter.org>
If a userspace helper policy update sets the maximum number of expectations
to zero, cap it to NF_CT_EXPECT_MAX_CNT (255) on updates too.
Fixes: 397c8300972f ("netfilter: nf_conntrack_helper: cap maximum number of expectation at helper registration")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nfnetlink_cthelper.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index f1460b683d7a..2cbcca9110db 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -163,6 +163,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+ if (!expect_policy->max_expected)
+ expect_policy->max_expected = NF_CT_EXPECT_MAX_CNT;
if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
return -EINVAL;
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 6/9] netfilter: nft_fib: reject fib expression on the netdev egress hook
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (4 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 5/9] netfilter: nfnetlink_cthelper: cap to maximum number of expectation per master Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 7/9] netfilter: nfnetlink_queue: restrict writes to network header Florian Westphal
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
From: Theodor Arsenij Larionov-Trichkine <theodorlarionov@gmail.com>
A fib expression in a netdev egress base chain dereferences nft_in(pkt),
which is NULL on the transmit path, causing a NULL pointer dereference at eval.
nft_fib_validate() masks the hook with NF_INET_* values, but netdev hook
numbers are a separate enum that aliases them (NF_NETDEV_EGRESS ==
NF_INET_LOCAL_IN), so an egress chain passes validation and then faults.
Add nft_fib_netdev_validate() that limits each result/flag to the netdev
hook where the device it reads exists: the input-device cases (OIF,
OIFNAME, ADDRTYPE with F_IIF) to ingress, the output-device case (ADDRTYPE
with F_OIF) to egress, ADDRTYPE with no device flag to both. Also restrict
nft_fib_validate() to NFPROTO_IPV4/IPV6/INET so its NF_INET_* masks are
not applied to another family's hooks.
Fixes: 42df6e1d221d ("netfilter: Introduce egress hook")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/netfilter-devel/ajxsjcDOnwllMfoR@strlen.de/
Signed-off-by: Theodor Arsenij Larionov-Trichkine <theodorlarionov@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_fib.c | 9 +++++++++
net/netfilter/nft_fib_netdev.c | 29 ++++++++++++++++++++++++++++-
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index e048f05694cd..89555380f1c5 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -31,6 +31,15 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
const struct nft_fib *priv = nft_expr_priv(expr);
unsigned int hooks;
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index 3f3478abd845..5774a7544027 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -50,6 +50,33 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static int nft_fib_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ hooks = (1 << NF_NETDEV_INGRESS);
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ if (priv->flags & NFTA_FIB_F_IIF)
+ hooks = (1 << NF_NETDEV_INGRESS);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ hooks = (1 << NF_NETDEV_EGRESS);
+ else
+ hooks = (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
static struct nft_expr_type nft_fib_netdev_type;
static const struct nft_expr_ops nft_fib_netdev_ops = {
.type = &nft_fib_netdev_type,
@@ -57,7 +84,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.eval = nft_fib_netdev_eval,
.init = nft_fib_init,
.dump = nft_fib_dump,
- .validate = nft_fib_validate,
+ .validate = nft_fib_netdev_validate,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 7/9] netfilter: nfnetlink_queue: restrict writes to network header
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (5 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 6/9] netfilter: nft_fib: reject fib expression on the netdev egress hook Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 8/9] netfilter: nftables: restrict linklayer and network header writes Florian Westphal
2026-06-30 4:52 ` [PATCH net 9/9] netfilter: nftables: restrict checkum update offset Florian Westphal
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
nfnetlink_queue doesn't allow selective replacements of some part of the
payload, only complete replacement.
If the new data is shorter, skb is trimmed, otherwise expanded.
Add minimal validation of the new ip/ipv6 header. Check total len
matches skb length. Disallow ip option modifications.
IPv6 extension headers are also disabled.
IP options and exthdrs could be allowed later after validation pass or
ip option recompile.
Transport header is not checked.
Bridge modifications are rejected. Given userspace doesn't even receive
L2 headers, use is limited and I don't think there are any users of
bridge nfnetlink_queue, let alone users that modify payload.
Arp isn't supported at all.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nfnetlink_queue.c | 170 ++++++++++++++++++++++++++++++++
1 file changed, 170 insertions(+)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 80ca077b81bd..35d4c6c628ff 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1184,6 +1184,173 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
return err;
}
+static bool nfqnl_validate_ipopts(const struct iphdr *iph_new,
+ const struct nf_queue_entry *e)
+{
+ const struct iphdr *iph_orig = ip_hdr(e->skb);
+ unsigned int ihl = iph_new->ihl * 4;
+
+ if (iph_new->ihl != iph_orig->ihl)
+ return false;
+ if (ihl == sizeof(*iph_orig))
+ return true;
+
+ return memcmp(iph_new + 1, ip_hdr(e->skb) + 1, ihl - sizeof(*iph_orig)) == 0;
+}
+
+static bool nfqnl_validate_ip4(const struct iphdr *iph, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ unsigned int ihl;
+
+ if (data_len < sizeof(*iph))
+ return false;
+
+ ihl = iph->ihl * 4u;
+ if (ihl < sizeof(*iph) || data_len < ihl)
+ return false;
+
+ if (iph->version != 4 ||
+ ((iph->frag_off ^ ip_hdr(e->skb)->frag_off) & ~htons(IP_DF)) != 0)
+ return false;
+
+ /* BIG TCP won't work; netlink attr len is u16 */
+ if (ntohs(iph->tot_len) != data_len)
+ return false;
+
+ /* support for ipopts mangling would require
+ * recompile + skb transport header update.
+ */
+ return nfqnl_validate_ipopts(iph, e);
+}
+
+static bool nfqnl_validate_one_exthdr(const u8 *data,
+ unsigned int data_len,
+ const struct nf_queue_entry *e,
+ int start, int hdrlen)
+{
+ u16 octets;
+
+ if (data_len < hdrlen || hdrlen < 2)
+ return false;
+
+ while (hdrlen > 0) {
+ if (data_len < sizeof(octets))
+ return false;
+ data_len -= sizeof(octets);
+
+ if (skb_copy_bits(e->skb, start, &octets, sizeof(octets)))
+ return false;
+
+ if (hdrlen < sizeof(octets))
+ return false;
+
+ hdrlen -= sizeof(octets);
+ if (memcmp(data, &octets, sizeof(octets)))
+ return false;
+
+ start += sizeof(octets);
+ data += sizeof(octets);
+ }
+
+ return true;
+}
+
+static bool nfqnl_validate_exthdr(const struct ipv6hdr *ip6_new,
+ unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ const struct ipv6hdr *ip6_orig = ipv6_hdr(e->skb);
+ int exthdr_cnt = 0, start = sizeof(*ip6_orig);
+ const u8 *data = (const u8 *)ip6_new;
+ u8 orig_nexthdr = ip6_orig->nexthdr;
+ u8 new_nexthdr = ip6_new->nexthdr;
+
+ if (new_nexthdr != orig_nexthdr)
+ return false;
+
+ data += sizeof(*ip6_new);
+ data_len -= sizeof(*ip6_new);
+
+ while (ipv6_ext_hdr(orig_nexthdr)) {
+ const struct ipv6_opt_hdr *hp;
+ struct ipv6_opt_hdr _hdr;
+ int hdrlen;
+
+ if (orig_nexthdr == NEXTHDR_NONE)
+ return true;
+
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return false;
+
+ hp = skb_header_pointer(e->skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ return false;
+
+ switch (orig_nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ hdrlen = sizeof(struct frag_hdr);
+ break;
+ case NEXTHDR_AUTH:
+ hdrlen = ipv6_authlen(hp);
+ break;
+ default:
+ hdrlen = ipv6_optlen(hp);
+ break;
+ }
+
+ if (!nfqnl_validate_one_exthdr(data, data_len, e,
+ start, hdrlen))
+ return false;
+
+ orig_nexthdr = hp->nexthdr;
+ hp = (const void *)data;
+ new_nexthdr = hp->nexthdr;
+
+ if (new_nexthdr != orig_nexthdr)
+ return false;
+
+ data_len -= hdrlen;
+ start += hdrlen;
+ data += hdrlen;
+ }
+
+ return true;
+}
+
+static bool nfqnl_validate_ip6(const struct ipv6hdr *ip6, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ if (data_len < sizeof(*ip6))
+ return false;
+
+ /* BIG TCP/jumbograms won't work; netlink attr len is u16 */
+ if (ntohs(ip6->payload_len) != data_len - sizeof(*ip6))
+ return false;
+
+ if (ip6->version != 6)
+ return false;
+
+ return nfqnl_validate_exthdr(ip6, data_len, e);
+}
+
+static bool nfqnl_validate_write(const void *data, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ switch (e->state.pf) {
+ case NFPROTO_IPV4:
+ return nfqnl_validate_ip4(data, data_len, e);
+ case NFPROTO_IPV6:
+ return nfqnl_validate_ip6(data, data_len, e) &&
+ !(IP6CB(e->skb)->flags & IP6SKB_JUMBOGRAM);
+ case NFPROTO_BRIDGE:
+ /* No write support. Bridge is dubious: userspace doesn't even see L2 header */
+ return false;
+ }
+
+ return false;
+}
+
static int
nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
@@ -1192,6 +1359,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di
if (e->state.net->user_ns != &init_user_ns)
return -EPERM;
+ if (!nfqnl_validate_write(data, data_len, e))
+ return -EINVAL;
+
if (diff < 0) {
unsigned int min_len = skb_transport_offset(e->skb);
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread* [PATCH net 8/9] netfilter: nftables: restrict linklayer and network header writes
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (6 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 7/9] netfilter: nfnetlink_queue: restrict writes to network header Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
2026-06-30 4:52 ` [PATCH net 9/9] netfilter: nftables: restrict checkum update offset Florian Westphal
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
Don't permit arbitrary writes to linklayer and network header data.
Several spots in network stack trust header validation performed in
ipv4/ipv6 before PRE_ROUTING hook.
For linklayer, allow writes for netdev ingress. For other hooks, only
allow link layer writes that do not spill into network header.
For network header, check the offset/length combinations:
- changing dscp requires a store at offset 0 for checksum fixups, so
make sure the ip version + header length field isn't altered.
- ip6 dscp starts directly after the version field, so make sure it
remains 6.
Several of these checks could already be done at rule insertion time.
Risk is that this might cause ruleset load failures for existing
rulesets. With this change such writes are silently skipped and packet
passes unchanged.
Transport and inner header bases are not checked / restricted.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_payload.c | 170 ++++++++++++++++++++++++++++++++++++
1 file changed, 170 insertions(+)
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 345eff140d56..9c974df59b42 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -834,6 +834,172 @@ nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
return true;
}
+/* Ingress is very early, before l3 protocol handlers.
+ * There should be no in-tree code that trusts l3/l4 headers
+ * between ingress and NF_INET_PRE_ROUTING hooks.
+ */
+static bool nft_in_ingress(const struct nf_hook_state *s)
+{
+ return s->pf == NFPROTO_NETDEV && s->hook == NF_NETDEV_INGRESS;
+}
+
+static bool nft_nh_write_ok_ip4(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ unsigned int offset = priv->offset + skb_network_offset(pkt->skb);
+ const u8 *new_octets = (const u8 *)src;
+ u8 old_octet;
+
+ switch (priv->offset) {
+ case 0: /* csum fixups does expand dscp/tos store to 2 bytes.
+ * make sure ihl/version remain unchanged.
+ */
+ if (skb_copy_bits(pkt->skb, offset, &old_octet, sizeof(old_octet)))
+ return false;
+
+ return priv->len == 2 &&
+ *new_octets == old_octet;
+ case offsetof(struct iphdr, tos):
+ return priv->len == 1;
+ case offsetof(struct iphdr, id):
+ return priv->len == 2;
+ case offsetof(struct iphdr, ttl):
+ if (priv->len == 1)
+ return true;
+
+ if (priv->len != 2)
+ return false;
+
+ /* same, csum fixup does expand ttl store to two bytes.
+ * check protocol is not altered.
+ */
+ if (skb_copy_bits(pkt->skb, offset + 1, &old_octet, sizeof(old_octet)))
+ return false;
+
+ return new_octets[1] == old_octet;
+ case offsetof(struct iphdr, check):
+ return priv->len <= 2 + 4 + 4;
+ case offsetof(struct iphdr, saddr):
+ return priv->len <= 4 + 4;
+ case offsetof(struct iphdr, daddr):
+ return priv->len <= 4;
+ }
+
+ return false;
+}
+
+static bool nft_nh_write_ok_ip6(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ const struct ipv6hdr *ih = (const void *)src;
+
+ switch (priv->offset) {
+ case 0: /* store to dscp must not alter ip6 version */
+ return priv->len <= 4 && ih->version == 6;
+ case 2:
+ return priv->len <= 2;
+ case offsetof(struct ipv6hdr, hop_limit):
+ return priv->len == 1;
+ case offsetof(struct ipv6hdr, saddr):
+ return priv->len <= 16 + 16;
+ case offsetof(struct ipv6hdr, daddr):
+ return priv->len <= 16;
+ }
+
+ return false;
+}
+
+static bool nft_nh_write_ok_arp(const struct nft_payload_set *priv)
+{
+ /* Variable size for standard ethernet arp */
+ const unsigned int eth_ip = 2 * (ETH_ALEN + 4);
+ unsigned int offset = priv->offset;
+
+ switch (offset) {
+ case offsetof(struct arphdr, ar_op):
+ return priv->len == 2;
+ default:
+ break;
+ }
+
+ /* permit writes post fixed arp header size. offset + len are
+ * checked vs skb size via skb_ensure_writable.
+ */
+ return offset >= sizeof(struct arphdr) && priv->len <= eth_ip;
+}
+
+static bool nft_nh_write_ok_netdev(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+#ifdef CONFIG_NF_TABLES_NETDEV
+ switch (pkt->skb->protocol) {
+ case htons(ETH_P_ARP):
+ return nft_nh_write_ok_arp(priv);
+ case htons(ETH_P_IP):
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case htons(ETH_P_IPV6):
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ }
+#endif
+ /* default to false for now, relax later in case we have
+ * use-cases that need inner header manipulation for
+ * encapsulated traffic like vlan or PPPoE.
+ */
+ return false;
+}
+
+static bool nft_nh_write_ok_bridge(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ switch (pkt->ethertype) {
+ case htons(ETH_P_ARP):
+ return nft_nh_write_ok_arp(priv);
+ case htons(ETH_P_IP):
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case htons(ETH_P_IPV6):
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ }
+#endif
+ /* see nft_nh_write_ok_netdev: default to false */
+ return false;
+}
+
+static bool nft_nh_write_ok(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ switch (pkt->state->pf) {
+ case NFPROTO_ARP:
+ return nft_nh_write_ok_arp(priv);
+ case NFPROTO_BRIDGE:
+ return nft_nh_write_ok_bridge(pkt, priv, src);
+ case NFPROTO_IPV4:
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case NFPROTO_IPV6:
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ case NFPROTO_NETDEV:
+ if (pkt->state->hook == NF_NETDEV_INGRESS)
+ return true;
+ return nft_nh_write_ok_netdev(pkt, priv, src);
+ }
+
+ return false;
+}
+
+/* check linklayer modifications don't spill into network header. */
+static bool nft_ll_write_ok(const struct nft_pktinfo *pkt, int offset)
+{
+ if (nft_in_ingress(pkt->state))
+ return true;
+
+ return offset <= skb_network_offset(pkt->skb);
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -861,8 +1027,12 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
}
offset = skb_mac_header(skb) - skb->data - vlan_hlen;
+ if (!nft_ll_write_ok(pkt, priv->len + priv->offset + offset))
+ goto err;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!nft_nh_write_ok(pkt, priv, src))
+ goto err;
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH net 9/9] netfilter: nftables: restrict checksum update offset
2026-06-30 4:52 [PATCH net 0/9] netfilter: updates for net Florian Westphal
` (7 preceding siblings ...)
2026-06-30 4:52 ` [PATCH net 8/9] netfilter: nftables: restrict linklayer and network header writes Florian Westphal
@ 2026-06-30 4:52 ` Florian Westphal
8 siblings, 0 replies; 10+ messages in thread
From: Florian Westphal @ 2026-06-30 4:52 UTC (permalink / raw)
To: netdev
Cc: Paolo Abeni, David S. Miller, Eric Dumazet, Jakub Kicinski,
netfilter-devel, pablo
After the previous patch, writes to the network header are restricted.
However, there is another way to manipulate the l3 header: the
checksum update function.
Restrict this for network header writes: only the ipv4 header checksum
may be updated. This needs run-time checks because the BRIDGE, INET and
NETDEV families can carry l3 headers other than IPv4.
Checksum updates to the udp/tcp (l4) headers are not restricted.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_payload.c | 100 ++++++++++++++++++++++++++++++++++++
1 file changed, 100 insertions(+)
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 9c974df59b42..391539a1ceaa 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1000,6 +1000,83 @@ static bool nft_ll_write_ok(const struct nft_pktinfo *pkt, int offset)
return offset <= skb_network_offset(pkt->skb);
}
+/* Init-time validation of an NFT_PAYLOAD_CSUM_INET request.
+ *
+ * Checks the configured checksum offset/flags against the payload base:
+ * - link layer and tunnel header bases are refused,
+ * - network header: with family NFPROTO_IPV4 the offset must be that of
+ *   iphdr->check; for any other family the decision is left to run time,
+ * - transport header: refused if csum_flags is set, otherwise accepted,
+ * - inner header: accepted.
+ *
+ * Returns true if the configuration is acceptable.
+ */
+static bool nft_payload_validate_inet_csum_offset(const struct nft_ctx *ctx,
+ const struct nft_payload_set *priv)
+{
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (ctx->family == NFPROTO_IPV4) {
+ if (offsetof(struct iphdr, check) == priv->csum_offset)
+ return true;
+
+ return false;
+ }
+ return true; /* run time validation required */
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (priv->csum_flags) /* makes no sense, asks for "re-update" of L4 checksum */
+ return false;
+
+ /* no further check here; offset can't be negative so bogus
+ * offsets can corrupt L4 or payload but not l3 headers.
+ * We already allow arbitrary l4/inner payload writes.
+ */
+ return true;
+ case NFT_PAYLOAD_INNER_HEADER:
+ return true;
+ case NFT_PAYLOAD_TUN_HEADER:
+ break;
+ }
+
+ return false;
+}
+
+/* Run-time check: do not allow arbitrary network header mangling via a
+ * bogus csum_offset. Only the IPv4 header checksum (iphdr->check) may be
+ * updated. Only NFPROTO_IPV4 can be checked from the control plane, so
+ * the remaining families are checked here per packet: bridge by
+ * pkt->ethertype, netdev by skb->protocol. IPv6 is always refused.
+ *
+ * Returns true if the checksum update may proceed.
+ */
+static bool nft_payload_csum_nh_write_ok(const struct nft_payload_set *priv,
+ const struct nft_pktinfo *pkt)
+{
+ switch (pkt->state->pf) {
+ case NFPROTO_IPV4:
+ /* Warning: NFPROTO_INET was not checked at init time;
+ * we can't return true here.
+ */
+ return priv->csum_offset == offsetof(struct iphdr, check);
+ case NFPROTO_IPV6:
+ return false;
+ case NFPROTO_BRIDGE:
+ return pkt->ethertype == htons(ETH_P_IP) &&
+ priv->csum_offset == offsetof(struct iphdr, check);
+ case NFPROTO_NETDEV:
+ return pkt->skb->protocol == htons(ETH_P_IP) &&
+ priv->csum_offset == offsetof(struct iphdr, check);
+ }
+
+ return false;
+}
+
+/* Run-time gate for an inet checksum update, keyed on the payload base.
+ *
+ * Network header updates are delegated to nft_payload_csum_nh_write_ok();
+ * transport and inner header updates are always allowed; link layer and
+ * tunnel header bases are refused.
+ *
+ * Returns true if the checksum update may proceed.
+ */
+static bool nft_payload_csum_write_ok(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv)
+{
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ return nft_payload_csum_nh_write_ok(priv, pkt);
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ case NFT_PAYLOAD_INNER_HEADER:
+ /* neither offset is validated; offsets cannot be
+ * negative so real l3 headers cannot be mangled.
+ */
+ return true;
+ case NFT_PAYLOAD_TUN_HEADER:
+ break;
+ }
+
+ return false;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -1064,6 +1141,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
tsum = csum_partial(src, priv->len, 0);
if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ nft_payload_csum_write_ok(pkt, priv) &&
nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
goto err;
@@ -1130,7 +1208,26 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
+ if (priv->csum_offset) /* nonsensical */
+ return -EINVAL;
+
+ if (priv->csum_flags == 0)
+ break;
+
+ /* Userspace requests L4 checksum update, e.g.:
+ * - IPv6 stateless NAT (no l3 csum)
+ * - transport header mangling
+ * - inner data mangling
+ */
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER ||
+ priv->base == NFT_PAYLOAD_TRANSPORT_HEADER ||
+ priv->base == NFT_PAYLOAD_INNER_HEADER)
+ break;
+
+ return -EINVAL;
case NFT_PAYLOAD_CSUM_INET:
+ if (!nft_payload_validate_inet_csum_offset(ctx, priv))
+ return -EINVAL;
break;
case NFT_PAYLOAD_CSUM_SCTP:
if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
@@ -1138,6 +1235,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
if (priv->csum_offset != offsetof(struct sctphdr, checksum))
return -EINVAL;
+
+ if (priv->csum_flags)
+ return -EINVAL;
break;
default:
return -EOPNOTSUPP;
--
2.53.0
^ permalink raw reply related [flat|nested] 10+ messages in thread