* [PATCH nf-next 1/2] netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation
@ 2026-01-26 13:49 Florian Westphal
2026-01-26 13:49 ` [PATCH nf-next 2/2] selftests: netfilter: nft_queue.sh: add udp fraglist gro test case Florian Westphal
0 siblings, 1 reply; 2+ messages in thread
From: Florian Westphal @ 2026-01-26 13:49 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal, Ulrich Weber
Ulrich reports a regression with nfqueue:
If an application did not set the 'F_GSO' capability flag and a gso
packet with an unconfirmed nf_conn entry is received all packets are
now dropped instead of queued, because the check happens after
skb_gso_segment(). In that case, we did have exclusive ownership
of the skb and its associated conntrack entry. The elevated use
count is due to skb_clone happening via skb_gso_segment().
Move the check so that its peformed vs. the aggregated packet.
Then, annotate the individual segments except the first one so we
can do a 2nd check at reinject time.
For the normal case, where userspace does in-order reinjects, this avoids
packet drops: first reinjected segment continues traversal and confirms
entry, remaining segments observe the confirmed entry.
While at it, simplify nf_ct_drop_unconfirmed(): We only care about
unconfirmed entries with a refcnt > 1, there is no need to special-case
dying entries.
This only happens with UDP. With TCP, the only unconfirmed packet will
be the TCP SYN, those aren't aggregated by GRO.
Next patch adds a udpgro test case to cover this scenario.
Reported-by: Ulrich Weber <ulrich.weber@gmail.com>
Fixes: 7d8dc1c7be8d ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
NB: This is on top of 'netfilter: nfnetlink_queue: optimize verdict lookup with hash table',
it won't apply as-is to main branch.
include/net/netfilter/nf_queue.h | 1 +
net/netfilter/nfnetlink_queue.c | 119 ++++++++++++++++++-------------
2 files changed, 69 insertions(+), 51 deletions(-)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index e6803831d6af..70dac4ab2f35 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -21,6 +21,7 @@ struct nf_queue_entry {
struct net_device *physout;
#endif
struct nf_hook_state state;
+ bool nf_ct_was_unconfirmed;
u16 size; /* sizeof(entry) + saved route keys */
u16 queue_num;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 671b52c652ef..930b0e534d1e 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -435,6 +435,33 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
nf_queue_entry_free(entry);
}
+/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
+ * exclusively.
+ */
+static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+
+ if (!ct || nf_ct_is_confirmed(ct))
+ return false;
+
+ *is_unconfirmed = true;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+ * This can't be solved with serialization here because one clone
+ * could have been queued for local delivery or could be transmitted
+ * in parallel on another CPU.
+ */
+ return refcount_read(&ct->ct_general.use) > 1;
+#endif
+ return false;
+}
+
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
const struct nf_ct_hook *ct_hook;
@@ -462,6 +489,26 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
break;
}
}
+
+ if (verdict != NF_DROP && entry->nf_ct_was_unconfirmed) {
+ bool is_unconfirmed = false;
+
+ /* If first queued segment was already reinjected then
+ * there is a good chance the ct entry is now confirmed.
+ *
+ * Handle the rare cases:
+ * - out-of-order verdict
+ * - threaded userspace reinjecting in parallel
+ * - first segment was dropped
+ *
+ * In all of those cases we can't handle this packet
+ * because we can't be sure that another CPU won't modify
+ * nf_conn->ext in parallel which isn't allowed.
+ */
+ if (nf_ct_drop_unconfirmed(entry, &is_unconfirmed))
+ verdict = NF_DROP;
+ }
+
nf_reinject(entry, verdict);
}
@@ -891,49 +938,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
return NULL;
}
-static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
-{
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- struct nf_conn *ct = (void *)skb_nfct(entry->skb);
- unsigned long status;
- unsigned int use;
-
- if (!ct)
- return false;
-
- status = READ_ONCE(ct->status);
- if ((status & flags) == IPS_DYING)
- return true;
-
- if (status & IPS_CONFIRMED)
- return false;
-
- /* in some cases skb_clone() can occur after initial conntrack
- * pickup, but conntrack assumes exclusive skb->_nfct ownership for
- * unconfirmed entries.
- *
- * This happens for br_netfilter and with ip multicast routing.
- * We can't be solved with serialization here because one clone could
- * have been queued for local delivery.
- */
- use = refcount_read(&ct->ct_general.use);
- if (likely(use == 1))
- return false;
-
- /* Can't decrement further? Exclusive ownership. */
- if (!refcount_dec_not_one(&ct->ct_general.use))
- return false;
-
- skb_set_nfct(entry->skb, 0);
- /* No nf_ct_put(): we already decremented .use and it cannot
- * drop down to 0.
- */
- return true;
-#endif
- return false;
-}
-
static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry)
@@ -950,9 +954,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
}
spin_lock_bh(&queue->lock);
- if (nf_ct_drop_unconfirmed(entry))
- goto err_out_free_nskb;
-
if (queue->queue_total >= queue->queue_maxlen)
goto err_out_queue_drop;
@@ -995,7 +996,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
else
net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
}
-err_out_free_nskb:
kfree_skb(nskb);
err_out_unlock:
spin_unlock_bh(&queue->lock);
@@ -1074,9 +1074,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
- unsigned int queued;
- struct nfqnl_instance *queue;
struct sk_buff *skb, *segs, *nskb;
+ bool ct_is_unconfirmed = false;
+ struct nfqnl_instance *queue;
+ unsigned int queued;
int err = -ENOBUFS;
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1101,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
break;
}
+ /* Check if someone already holds another reference to
+ * unconfirmed ct. If so, we cannot queue the skb:
+ * concurrent modifications of nf_conn->ext are not
+ * allowed and we can't know if another CPU isn't
+ * processing the same nf_conn entry in parallel.
+ */
+ if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
+ return -EINVAL;
+
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
return __nfqnl_enqueue_packet(net, queue, entry);
@@ -1117,10 +1127,17 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (err == 0)
err = __nfqnl_enqueue_packet_gso(net, queue,
segs, entry);
- if (err == 0)
+ if (err == 0) {
queued++;
- else
+ /* skb_gso_segment() caused increment of ct refcount.
+ * Annotate this for all queued entries except the first one
+ * queued. As long as the first one is reinjected first it
+ * will do the confirmation for us.
+ */
+ entry->nf_ct_was_unconfirmed = ct_is_unconfirmed;
+ } else {
kfree_skb(segs);
+ }
}
if (queued) {
--
2.52.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH nf-next 2/2] selftests: netfilter: nft_queue.sh: add udp fraglist gro test case
2026-01-26 13:49 [PATCH nf-next 1/2] netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation Florian Westphal
@ 2026-01-26 13:49 ` Florian Westphal
0 siblings, 0 replies; 2+ messages in thread
From: Florian Westphal @ 2026-01-26 13:49 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
Without the preceding patch, this fails with:
FAIL: test_udp_gro_ct: Expected udp conntrack entry
FAIL: test_udp_gro_ct: Expected software segmentation to occur, had 10 and 0
Signed-off-by: Florian Westphal <fw@strlen.de>
---
.../selftests/net/netfilter/nft_queue.sh | 142 +++++++++++++++++-
1 file changed, 136 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh
index 6136ceec45e0..139bc1211878 100755
--- a/tools/testing/selftests/net/netfilter/nft_queue.sh
+++ b/tools/testing/selftests/net/netfilter/nft_queue.sh
@@ -510,7 +510,7 @@ EOF
udp_listener_ready()
{
- ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
+ ss -S -N "$1" -uln -o "sport = :$2" | grep -q "$2"
}
output_files_written()
@@ -518,7 +518,7 @@ output_files_written()
test -s "$1" && test -s "$2"
}
-test_udp_ct_race()
+test_udp_nat_race()
{
ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
flush ruleset
@@ -545,8 +545,8 @@ EOF
ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
local nfqpid=$!
- busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
- busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
+ busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345
+ busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345
busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
# Send two packets, one should end up in ns1, other in ns2.
@@ -557,7 +557,7 @@ EOF
busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
- kill "$nfqpid"
+ kill "$nfqpid" "$rpid1" "$rpid2"
if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
echo "FAIL: Expected One udp conntrack entry"
@@ -585,6 +585,135 @@ EOF
echo "PASS: both udp receivers got one packet each"
}
+# Make sure UDPGRO aggregated packets don't lose
+# their skb->nfct entry when nfqueue passes the
+# skb to userspace with software gso segmentation on.
+test_udp_gro_ct()
+{
+ local errprefix="FAIL: test_udp_gro_ct:"
+
+ ip netns exec "$nsrouter" conntrack -F 2>/dev/null
+
+ ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table inet udpq {
+ # Number of packets/bytes queued to userspace
+ counter toqueue { }
+ # Number of packets/bytes reinjected from userspace with 'ct new' intact
+ counter fromqueue { }
+ # These two counters should be identical and not 0.
+
+ chain prerouting {
+ type filter hook prerouting priority -300; policy accept;
+
+ # userspace sends small packets, if < 1000, UDPGRO did
+ # not kick in, but test needs a 'new' conntrack with udpgro skb.
+ meta iifname veth0 meta l4proto udp meta length > 1000 accept
+
+ # don't pick up non-gso packets and don't queue them to
+ # userspace.
+ notrack
+ }
+
+ chain postrouting {
+ type filter hook postrouting priority 0; policy accept;
+
+ # Only queue unconfirmed fraglist gro skbs to userspace.
+ udp dport 12346 ct status ! confirmed counter name "toqueue" mark set 1 queue num 1
+ }
+
+ chain validate {
+ type filter hook postrouting priority 1; policy accept;
+ # ... and only count those that were reinjected with the
+ # skb->nfct intact.
+ mark 1 counter name "fromqueue"
+ }
+}
+EOF
+ timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
+ local rpid=$!
+
+ ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" &
+ local nfqpid=$!
+
+ ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on
+
+ busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12346
+ busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1
+
+ local bs=512
+ local count=$(((32 * 1024 * 1024) / bs))
+ dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do
+ timeout 5 ip netns exec "$ns1" \
+ socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 &
+ done
+
+ busywait 10000 test -s "$TMPFILE1"
+
+ kill "$rpid"
+
+ wait
+
+ local p
+ local b
+ local pqueued
+ local bqueued
+
+ c=$(ip netns exec "$nsrouter" nft list counter inet udpq "toqueue" | grep packets)
+ read p pqueued b bqueued <<EOF
+$c
+EOF
+ local preinject
+ local breinject
+ c=$(ip netns exec "$nsrouter" nft list counter inet udpq "fromqueue" | grep packets)
+ read p preinject b breinject <<EOF
+$c
+EOF
+ ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding off
+ ip netns exec "$nsrouter" ethtool -K "veth1" rx-udp-gro-forwarding off
+
+ if [ "$pqueued" -eq 0 ];then
+ # happens when gro did not build at least on aggregate
+ echo "SKIP: No packets were queued"
+ return
+ fi
+
+ local saw_ct_entry=0
+ if ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12346 2>/dev/null | wc -l | grep -q "^1"'; then
+ saw_ct_entry=1
+ else
+ echo "$errprefix Expected udp conntrack entry"
+ ip netns exec "$nsrouter" conntrack -L
+ ret=1
+ fi
+
+ if [ "$pqueued" -ge "$preinject" ] ;then
+ echo "$errprefix Expected software segmentation to occur, had $pqueued and $preinject"
+ ret=1
+ return
+ fi
+
+ # sw segmentation adds extra udp and ip headers.
+ local breinject_expect=$((preinject * (512 + 20 + 8)))
+
+ if [ "$breinject" -eq "$breinject_expect" ]; then
+ if [ "$saw_ct_entry" -eq 1 ];then
+ echo "PASS: fraglist gro skb passed with conntrack entry"
+ else
+ echo "$errprefix fraglist gro skb passed without conntrack entry"
+ ret=1
+ fi
+ else
+ echo "$errprefix Counter mismatch, conntrack entry dropped by nfqueue? Queued: $pqueued, $bqueued. Post-queue: $preinject, $breinject. Expected $breinject_expect"
+ ret=1
+ fi
+
+ if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
+ echo "$errprefix: Could not delete udpq table"
+ ret=1
+ fi
+}
+
test_queue_removal()
{
read tainted_then < /proc/sys/kernel/tainted
@@ -663,7 +792,8 @@ test_tcp_localhost_connectclose
test_tcp_localhost_requeue
test_sctp_forward
test_sctp_output
-test_udp_ct_race
+test_udp_nat_race
+test_udp_gro_ct
# should be last, adds vrf device in ns1 and changes routes
test_icmp_vrf
--
2.52.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-01-26 13:50 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-01-26 13:49 [PATCH nf-next 1/2] netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation Florian Westphal
2026-01-26 13:49 ` [PATCH nf-next 2/2] selftests: netfilter: nft_queue.sh: add udp fraglist gro test case Florian Westphal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox