* nfnetlink_queue crashes kernel
@ 2026-04-03 12:22 Florian Westphal
2026-04-03 13:45 ` Florian Westphal
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Florian Westphal @ 2026-04-03 12:22 UTC (permalink / raw)
To: Scott Mitchell; +Cc: netfilter-devel
Hello,
nfnetlink_queue is currently broken in at least two ways.
1) kernel will crash under pressure because
struct nf_queue_entry is freed via kfree, but a parallel
CPU can still observe the entry while walking the (global) rhashtable.
2) a4400a5b343d ("netfilter: nfnetlink_queue: nfqnl_instance GFP_ATOMIC -> GFP_KERNEL_ACCOUNT allocation") should have updated the spinlock to use the _bh variant; if the queue exists, we risk a deadlock via softirq recursion.
Minimal fix, which I am not a fan of:
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -24,6 +24,7 @@ struct nf_queue_entry {
bool nf_ct_is_unconfirmed;
u16 size; /* sizeof(entry) + saved route keys */
u16 queue_num;
+ struct rcu_head head;
/* extra space to store route keys */
};
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 7f12e56e6e52..385d1fe704ae 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -74,7 +74,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
void nf_queue_entry_free(struct nf_queue_entry *entry)
{
nf_queue_entry_release_refs(entry);
- kfree(entry);
+ kfree_rcu(entry, head);
}
EXPORT_SYMBOL_GPL(nf_queue_entry_free);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 47f7f62906e2..fc44ea4e5128 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -190,7 +190,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
- spin_lock(&q->instances_lock);
+ spin_lock_bh(&q->instances_lock);
if (instance_lookup(q, queue_num)) {
err = -EEXIST;
goto out_unlock;
@@ -204,7 +204,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
h = instance_hashfn(queue_num);
hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
- spin_unlock(&q->instances_lock);
+ spin_unlock_bh(&q->instances_lock);
return inst;
A probably better fix is to make the rhashtable per-queue, which is
much more intrusive at this late stage.
I prefer to revert both changes and not accept a reworked hashtable
until we have an extension to the nft_queue.sh selftest.
Anything using queue balancing (pacing packets to n queues) with
enough streams to have parallel invocation of the rhashtable should
demonstrate the crash.
Thoughts?
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: nfnetlink_queue crashes kernel 2026-04-03 12:22 nfnetlink_queue crashes kernel Florian Westphal @ 2026-04-03 13:45 ` Florian Westphal 2026-04-03 15:55 ` Scott Mitchell ` (2 more replies) 2026-04-03 13:59 ` Florian Westphal 2026-04-03 14:02 ` Pablo Neira Ayuso 2 siblings, 3 replies; 12+ messages in thread From: Florian Westphal @ 2026-04-03 13:45 UTC (permalink / raw) To: Scott Mitchell; +Cc: netfilter-devel Florian Westphal <fw@strlen.de> wrote: > A probably better fix is to make the rhashtable perqueue, which is > much more intrusive at this late stage. Tentative patch to do this, still misses selftest extensions: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 47f7f62906e2..15c6276f6592 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -60,29 +60,10 @@ */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) -/* Composite key for packet lookup: (net, queue_num, packet_id) */ -struct nfqnl_packet_key { - possible_net_t net; - u32 packet_id; - u16 queue_num; -} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */ - -/* Global rhashtable - one for entire system, all netns */ -static struct rhashtable nfqnl_packet_map __read_mostly; - -/* Helper to initialize composite key */ -static inline void nfqnl_init_key(struct nfqnl_packet_key *key, - struct net *net, u32 packet_id, u16 queue_num) -{ - memset(key, 0, sizeof(*key)); - write_pnet(&key->net, net); - key->packet_id = packet_id; - key->queue_num = queue_num; -} - struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ - struct rcu_head rcu; + struct rhashtable nfqnl_packet_map; + struct rcu_work rwork; u32 peer_portid; unsigned int queue_maxlen; @@ -106,6 +87,7 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); +static struct workqueue_struct *nfq_cleanup_wq __read_mostly; static unsigned int 
nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 @@ -124,34 +106,10 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } -/* Extract composite key from nf_queue_entry for hashing */ -static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct nf_queue_entry *entry = data; - struct nfqnl_packet_key key; - - nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num); - - return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed); -} - -/* Compare stack-allocated key against entry */ -static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nfqnl_packet_key *key = arg->key; - const struct nf_queue_entry *entry = obj; - - return !net_eq(entry->state.net, read_pnet(&key->net)) || - entry->queue_num != key->queue_num || - entry->id != key->packet_id; -} - static const struct rhashtable_params nfqnl_rhashtable_params = { .head_offset = offsetof(struct nf_queue_entry, hash_node), - .key_len = sizeof(struct nfqnl_packet_key), - .obj_hashfn = nfqnl_packet_obj_hashfn, - .obj_cmpfn = nfqnl_packet_obj_cmpfn, + .key_offset = offsetof(struct nf_queue_entry, id), + .key_len = sizeof(u32), .automatic_shrinking = true, .min_size = NFQNL_HASH_MIN, .max_size = NFQNL_HASH_MAX, @@ -190,7 +148,11 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); - spin_lock(&q->instances_lock); + err = rhashtable_init(&inst->nfqnl_packet_map, &nfqnl_rhashtable_params); + if (err < 0) + goto out_free; + + spin_lock_bh(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; @@ -204,12 +166,14 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) h = instance_hashfn(queue_num); hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); - spin_unlock(&q->instances_lock); + 
spin_unlock_bh(&q->instances_lock); return inst; out_unlock: - spin_unlock(&q->instances_lock); + spin_unlock_bh(&q->instances_lock); + rhashtable_destroy(&inst->nfqnl_packet_map); +out_free: kfree(inst); return ERR_PTR(err); } @@ -217,15 +181,18 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); -static void -instance_destroy_rcu(struct rcu_head *head) +static void instance_destroy_work(struct work_struct *work) { - struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, - rcu); + struct nfqnl_instance *inst; + inst = container_of(to_rcu_work(work), struct nfqnl_instance, + rwork); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); + + rhashtable_destroy(&inst->nfqnl_packet_map); + kfree(inst); module_put(THIS_MODULE); } @@ -234,7 +201,9 @@ static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); - call_rcu(&inst->rcu, instance_destroy_rcu); + + INIT_RCU_WORK(&inst->rwork, instance_destroy_work); + queue_rcu_work(nfq_cleanup_wq, &inst->rwork); } static void @@ -250,9 +219,7 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { int err; - entry->queue_num = queue->queue_num; - - err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node, + err = rhashtable_insert_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); if (unlikely(err)) return err; @@ -266,23 +233,19 @@ __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node, + rhashtable_remove_fast(&queue->nfqnl_packet_map, &entry->hash_node, nfqnl_rhashtable_params); list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id, 
- struct net *net) +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { - struct nfqnl_packet_key key; struct nf_queue_entry *entry; - nfqnl_init_key(&key, net, id, queue->queue_num); - spin_lock_bh(&queue->lock); - entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key, + entry = rhashtable_lookup_fast(&queue->nfqnl_packet_map, &id, nfqnl_rhashtable_params); if (entry) @@ -1531,7 +1494,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, verdict = ntohl(vhdr->verdict); - entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net); + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; @@ -1880,40 +1843,38 @@ static int __init nfnetlink_queue_init(void) { int status; - status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params); - if (status < 0) - return status; + nfq_cleanup_wq = alloc_ordered_workqueue("nfq_workqueue", 0); + if (!nfq_cleanup_wq) + return -ENOMEM; status = register_pernet_subsys(&nfnl_queue_net_ops); - if (status < 0) { - pr_err("failed to register pernet ops\n"); - goto cleanup_rhashtable; - } + if (status < 0) + goto cleanup_pernet_subsys; - netlink_register_notifier(&nfqnl_rtnl_notifier); - status = nfnetlink_subsys_register(&nfqnl_subsys); - if (status < 0) { - pr_err("failed to create netlink socket\n"); - goto cleanup_netlink_notifier; - } + status = netlink_register_notifier(&nfqnl_rtnl_notifier); + if (status < 0) + goto cleanup_rtnl_notifier; status = register_netdevice_notifier(&nfqnl_dev_notifier); - if (status < 0) { - pr_err("failed to register netdevice notifier\n"); - goto cleanup_netlink_subsys; - } + if (status < 0) + goto cleanup_dev_notifier; + + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) + goto cleanup_nfqnl_subsys; nf_register_queue_handler(&nfqh); return status; -cleanup_netlink_subsys: - nfnetlink_subsys_unregister(&nfqnl_subsys); -cleanup_netlink_notifier: +cleanup_nfqnl_subsys: + 
netlink_unregister_notifier(&nfqnl_dev_notifier); +cleanup_dev_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); +cleanup_rtnl_notifier: unregister_pernet_subsys(&nfnl_queue_net_ops); -cleanup_rhashtable: - rhashtable_destroy(&nfqnl_packet_map); +cleanup_pernet_subsys: + destroy_workqueue(nfq_cleanup_wq); return status; } @@ -1924,9 +1885,7 @@ static void __exit nfnetlink_queue_fini(void) nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); - - rhashtable_destroy(&nfqnl_packet_map); - + destroy_workqueue(nfq_cleanup_wq); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 13:45 ` Florian Westphal @ 2026-04-03 15:55 ` Scott Mitchell 2026-04-03 19:14 ` Florian Westphal 2026-04-03 23:57 ` Fernando Fernandez Mancera 2026-04-06 14:01 ` Fernando Fernandez Mancera 2 siblings, 1 reply; 12+ messages in thread From: Scott Mitchell @ 2026-04-03 15:55 UTC (permalink / raw) To: Florian Westphal; +Cc: netfilter-devel > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c > index 47f7f62906e2..15c6276f6592 100644 > --- a/net/netfilter/nfnetlink_queue.c > +++ b/net/netfilter/nfnetlink_queue.c > @@ -60,29 +60,10 @@ > */ > #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) NFQNL_HASH_MIN (1024) and NFQNL_HASH_MAX(1048576) were set when the table was global, but if table is moved to per queue it can likely be reduced. Suggested values: #define NFQNL_HASH_MIN 8 #define NFQNL_HASH_MAX 32768 > > -cleanup_netlink_subsys: > - nfnetlink_subsys_unregister(&nfqnl_subsys); > -cleanup_netlink_notifier: > +cleanup_nfqnl_subsys: > + netlink_unregister_notifier(&nfqnl_dev_notifier); Should `netlink_unregister_notifier(&nfqnl_dev_notifier);` be 'unregister_netdevice_notifier(&nfqnl_dev_notifier);' ? > +cleanup_dev_notifier: > netlink_unregister_notifier(&nfqnl_rtnl_notifier); > +cleanup_rtnl_notifier: > unregister_pernet_subsys(&nfnl_queue_net_ops); > -cleanup_rhashtable: > - rhashtable_destroy(&nfqnl_packet_map); > +cleanup_pernet_subsys: > + destroy_workqueue(nfq_cleanup_wq); > return status; > } ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 15:55 ` Scott Mitchell @ 2026-04-03 19:14 ` Florian Westphal 0 siblings, 0 replies; 12+ messages in thread From: Florian Westphal @ 2026-04-03 19:14 UTC (permalink / raw) To: Scott Mitchell; +Cc: netfilter-devel Scott Mitchell <scott.k.mitch1@gmail.com> wrote: > > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c > > index 47f7f62906e2..15c6276f6592 100644 > > --- a/net/netfilter/nfnetlink_queue.c > > +++ b/net/netfilter/nfnetlink_queue.c > > @@ -60,29 +60,10 @@ > > */ > > #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) > > NFQNL_HASH_MIN (1024) and NFQNL_HASH_MAX(1048576) were set when the > table was global, but if table is moved to per queue it can likely be > reduced. Suggested values: > > #define NFQNL_HASH_MIN 8 > #define NFQNL_HASH_MAX 32768 Changed, thanks. > Should `netlink_unregister_notifier(&nfqnl_dev_notifier);` be > 'unregister_netdevice_notifier(&nfqnl_dev_notifier);' ? Yes, thanks. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 13:45 ` Florian Westphal 2026-04-03 15:55 ` Scott Mitchell @ 2026-04-03 23:57 ` Fernando Fernandez Mancera 2026-04-04 9:40 ` Florian Westphal 2026-04-06 14:01 ` Fernando Fernandez Mancera 2 siblings, 1 reply; 12+ messages in thread From: Fernando Fernandez Mancera @ 2026-04-03 23:57 UTC (permalink / raw) To: Florian Westphal, Scott Mitchell; +Cc: netfilter-devel On 4/3/26 3:45 PM, Florian Westphal wrote: > Florian Westphal <fw@strlen.de> wrote: >> A probably better fix is to make the rhashtable perqueue, which is >> much more intrusive at this late stage. > > Tentative patch to do this, still misses selftest extensions: > I could help with selftests. I have written a couple already. Let me prepare some this week and I will send them as proposals on the list. Thanks, Fernando. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 23:57 ` Fernando Fernandez Mancera @ 2026-04-04 9:40 ` Florian Westphal 2026-04-06 12:54 ` Fernando Fernandez Mancera 0 siblings, 1 reply; 12+ messages in thread From: Florian Westphal @ 2026-04-04 9:40 UTC (permalink / raw) To: Fernando Fernandez Mancera; +Cc: Scott Mitchell, netfilter-devel Fernando Fernandez Mancera <fmancera@suse.de> wrote: > On 4/3/26 3:45 PM, Florian Westphal wrote: > > Florian Westphal <fw@strlen.de> wrote: > > > A probably better fix is to make the rhashtable perqueue, which is > > > much more intrusive at this late stage. > > > > Tentative patch to do this, still misses selftest extensions: > > > > I could help with selftests. I have written a couple already. Let me prepare > some this week and I will send them as proposals on the list. Thanks Fernando, much appreciated. This will be hard to trigger, the autoresize means that we'll typically not have two entries per bucket. What might help is to add a mode to nf_queue.c to: 1. send out-of-order-verdicts 2. send *bogus* verdicts that are expected to fail w. -ENOENT. I had a go at adding a stress test but its not triggering for me even if i run it for 10m. I'm attaching what I had: selftests: nft_queue.sh: add a parallel stress test XXX: Not complete, should extend nf_queue.c to allow OOO verdicts + bogus verdicts to increase likelyhood of accessing already-freed objects in the hash table. 
Signed-off-by: Florian Westphal <fw@strlen.de> diff --git a/tools/testing/selftests/net/netfilter/nft_queue.sh b/tools/testing/selftests/net/netfilter/nft_queue.sh index ea766bdc5d04..c05f2e5fef0b 100755 --- a/tools/testing/selftests/net/netfilter/nft_queue.sh +++ b/tools/testing/selftests/net/netfilter/nft_queue.sh @@ -11,6 +11,7 @@ ret=0 timeout=5 SCTP_TEST_TIMEOUT=60 +STRESS_TEST_TIMEOUT=300 cleanup() { @@ -719,6 +720,64 @@ EOF fi } +check_tainted() +{ + local msg="$1" + + if [ "$tainted_then" -ne 0 ];then + return + fi + + read tainted_now < /proc/sys/kernel/tainted + if [ "$tainted_now" -eq 0 ];then + echo "PASS: $msg" + else + echo "TAINT: $msg" + dmesg + ret=1 + fi +} + +test_queue_stress() +{ + read tainted_then < /proc/sys/kernel/tainted + local i + + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF +flush ruleset +table inet t { + chain forward { + type filter hook forward priority 0; policy accept; + + queue flags bypass to numgen random mod 8 + } +} +EOF + timeout "$STRESS_TEST_TIMEOUT" ip netns exec "$ns2" socat -u UDP-LISTEN:12345,fork,pf=ipv4 STDOUT > /dev/null & + timeout "$STRESS_TEST_TIMEOUT" ip netns exec "$ns3" socat -u UDP-LISTEN:12345,fork,pf=ipv4 STDOUT > /dev/null & + + for i in $(seq 0 7); do + ip netns exec "$nsrouter" timeout "$STRESS_TEST_TIMEOUT" ./nf_queue -q $i -t 2 > /dev/null & + done + + ip netns exec "$ns1" timeout "$STRESS_TEST_IMEOUT" ping -q -f 10.0.2.99 > /dev/null 2>&1 & + ip netns exec "$ns1" timeout "$STRESS_TEST_TIMEOUT" ping -q -f 10.0.3.99 > /dev/null 2>&1 & + ip netns exec "$ns1" timeout "$STRESS_TEST_TIMEOUT" ping -q -f "dead:2::99" > /dev/null 2>&1 & + ip netns exec "$ns1" timeout "$STRESS_TEST_TIMEOUT" ping -q -f "dead:3::99" > /dev/null 2>&1 & + + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345 + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345 + + for i in $(seq 1 4);do + ip netns exec "$ns1" timeout "$STRESS_TEST_TIMEOUT" socat -u STDIN UDP-DATAGRAM:10.0.2.99:12345 < /dev/zero > /dev/null 
& + ip netns exec "$ns1" timeout "$STRESS_TEST_TIMEOUT" socat -u STDIN UDP-DATAGRAM:10.0.3.99:12345 < /dev/zero > /dev/null & + done + + wait + + check_tainted "concurrent queueing" +} + test_queue_removal() { read tainted_then < /proc/sys/kernel/tainted @@ -742,18 +801,7 @@ EOF ip netns exec "$ns1" nft flush ruleset - if [ "$tainted_then" -ne 0 ];then - return - fi - - read tainted_now < /proc/sys/kernel/tainted - if [ "$tainted_now" -eq 0 ];then - echo "PASS: queue program exiting while packets queued" - else - echo "TAINT: queue program exiting while packets queued" - dmesg - ret=1 - fi + check_tainted "queue program exiting while packets queued" } ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null @@ -799,6 +847,7 @@ test_sctp_forward test_sctp_output test_udp_nat_race test_udp_gro_ct +test_queue_stress # should be last, adds vrf device in ns1 and changes routes test_icmp_vrf ^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-04 9:40 ` Florian Westphal @ 2026-04-06 12:54 ` Fernando Fernandez Mancera 2026-04-06 17:10 ` Florian Westphal 0 siblings, 1 reply; 12+ messages in thread From: Fernando Fernandez Mancera @ 2026-04-06 12:54 UTC (permalink / raw) To: Florian Westphal; +Cc: Scott Mitchell, netfilter-devel On 4/4/26 11:40 AM, Florian Westphal wrote: > Fernando Fernandez Mancera <fmancera@suse.de> wrote: >> On 4/3/26 3:45 PM, Florian Westphal wrote: >>> Florian Westphal <fw@strlen.de> wrote: >>>> A probably better fix is to make the rhashtable perqueue, which is >>>> much more intrusive at this late stage. >>> >>> Tentative patch to do this, still misses selftest extensions: >>> >> >> I could help with selftests. I have written a couple already. Let me prepare >> some this week and I will send them as proposals on the list. > > Thanks Fernando, much appreciated. > This will be hard to trigger, the autoresize means that we'll typically > not have two entries per bucket. > > What might help is to add a mode to nf_queue.c to: > 1. send out-of-order-verdicts > 2. send *bogus* verdicts that are expected to > fail w. -ENOENT. > > I had a go at adding a stress test but its not > triggering for me even if i run it for 10m. Bingo! I modified the test you attached a bit to include your suggestions and I got the splat on virtme-ng when running the test. See the whole splat attached below. I am going to test your proposed patch and check out the results. In addition I will do some cleanup and send this as a formal nf-next patch. 
[ 283.183815] ================================================================== [ 283.184508] BUG: KASAN: slab-use-after-free in nfqnl_recv_verdict+0x149a/0x1a60 [nfnetlink_queue] [ 283.184508] Read of size 8 at addr ffff88800f151b10 by task nf_queue/685 [ 283.184508] [ 283.184508] CPU: 10 UID: 0 PID: 685 Comm: nf_queue Not tainted 7.0.0-rc6-virtme #2 PREEMPT(lazy) [ 283.184508] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 283.184508] Call Trace: [ 283.184508] <TASK> [ 283.184508] dump_stack_lvl+0x4d/0x70 [ 283.184508] print_report+0x170/0x4f3 [ 283.184508] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 283.184508] ? __sys_sendto+0x342/0x390 [ 283.184508] kasan_report+0xda/0x110 [ 283.184508] ? nfqnl_recv_verdict+0x149a/0x1a60 [nfnetlink_queue] [ 283.184508] ? nfqnl_recv_verdict+0x149a/0x1a60 [nfnetlink_queue] [ 283.184508] nfqnl_recv_verdict+0x149a/0x1a60 [nfnetlink_queue] [ 283.184508] ? __pfx_nfqnl_recv_verdict+0x10/0x10 [nfnetlink_queue] [ 283.184508] ? unwind_next_frame+0x3c4/0x1f70 [ 283.184508] ? __is_insn_slot_addr+0x8c/0x100 [ 283.184508] ? is_module_text_address+0x111/0x180 [ 283.184508] ? __pfx_nfqnl_recv_verdict+0x10/0x10 [nfnetlink_queue] [ 283.184508] nfnetlink_rcv_msg+0x455/0x840 [nfnetlink] [ 283.184508] ? __pfx_nfnetlink_rcv_msg+0x10/0x10 [nfnetlink] [ 283.184508] ? __kasan_slab_alloc+0x63/0x80 [ 283.184508] ? kmem_cache_alloc_node_noprof+0x119/0x3c0 [ 283.184508] ? kmalloc_reserve+0x101/0x2b0 [ 283.184508] ? __alloc_skb+0x11e/0x820 [ 283.184508] ? netlink_sendmsg+0x503/0xbb0 [ 283.184508] ? __sys_sendto+0x342/0x390 [ 283.184508] ? __x64_sys_sendto+0xe4/0x1f0 [ 283.184508] ? do_syscall_64+0xe2/0xf80 [ 283.184508] ? entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 283.184508] netlink_rcv_skb+0x126/0x380 [ 283.184508] ? __pfx_nfnetlink_rcv_msg+0x10/0x10 [nfnetlink] [ 283.184508] ? __pfx_netlink_rcv_skb+0x10/0x10 [ 283.184508] ? __asan_memset+0x27/0x50 [ 283.184508] nfnetlink_rcv+0x166/0x4a0 [nfnetlink] [ 283.184508] ? 
__pfx___netlink_lookup+0x10/0x10 [ 283.184508] ? __pfx_nfnetlink_rcv+0x10/0x10 [nfnetlink] [ 283.184508] netlink_unicast+0x614/0x830 [ 283.184508] ? __pfx_netlink_unicast+0x10/0x10 [ 283.184508] ? __pfx___alloc_skb+0x10/0x10 [ 283.184508] netlink_sendmsg+0x6d0/0xbb0 [ 283.184508] ? __pfx_netlink_sendmsg+0x10/0x10 [ 283.184508] ? _copy_from_user+0x2d/0x80 [ 283.184508] __sys_sendto+0x342/0x390 [ 283.184508] ? __pfx___sys_sendto+0x10/0x10 [ 283.184508] __x64_sys_sendto+0xe4/0x1f0 [ 283.184508] ? fpregs_assert_state_consistent+0x5b/0xf0 [ 283.184508] do_syscall_64+0xe2/0xf80 [ 283.184508] ? __sysvec_call_function+0x20/0x280 [ 283.184508] ? exc_page_fault+0x6f/0xc0 [ 283.184508] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 283.184508] RIP: 0033:0x7f49aacf0c5e [ 283.184508] Code: 4d 89 d8 e8 34 bd 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 <c9> c3 83 e2 39 83 fa 08 75 e7 e8 13 ff ff ff 0f 1f 00 f3 0f 1e fa [ 283.184508] RSP: 002b:00007fffdfd753c0 EFLAGS: 00000202 ORIG_RAX: 000000000000002c [ 283.184508] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f49aacf0c5e [ 283.184508] RDX: 0000000000000020 RSI: 000000000c085010 RDI: 0000000000000005 [ 283.184508] RBP: 00007fffdfd753d0 R08: 00007f49aae78980 R09: 000000000000000c [ 283.184508] R10: 0000000000000000 R11: 0000000000000202 R12: 00007fffdfd755f8 [ 283.184508] R13: 0000000000000007 R14: 00007f49aaedc000 R15: 0000000000403df0 [ 283.184508] </TASK> [ 283.184508] [ 283.184508] Allocated by task 709: [ 283.184508] kasan_save_stack+0x30/0x50 [ 283.184508] kasan_save_track+0x14/0x30 [ 283.184508] __kasan_kmalloc+0x7f/0x90 [ 283.184508] __kmalloc_noprof+0x180/0x4b0 [ 283.184508] nf_queue+0x10d/0x1700 [ 283.184508] nf_hook_slow+0x13f/0x1e0 [ 283.184508] ip_forward+0x1637/0x2010 [ 283.184508] ip_rcv+0x2e8/0x360 [ 283.184508] __netif_receive_skb_one_core+0x14f/0x1b0 [ 283.184508] process_backlog+0x1ea/0x5e0 [ 283.184508] __napi_poll+0x9d/0x4a0 [ 283.184508] 
net_rx_action+0x980/0xfb0 [ 283.184508] handle_softirqs+0x18e/0x520 [ 283.184508] do_softirq+0x42/0x60 [ 283.184508] __local_bh_enable_ip+0x62/0x70 [ 283.184508] __dev_queue_xmit+0x903/0x3390 [ 283.184508] ip_finish_output2+0x55d/0x1cb0 [ 283.184508] ip_do_fragment+0x13fb/0x1f80 [ 283.184508] __ip_finish_output+0x536/0xb10 [ 283.184508] ip_output+0x17a/0x2f0 [ 283.184508] ip_send_skb+0x113/0x150 [ 283.184508] udp_send_skb+0x7c9/0xff0 [ 283.184508] udp_sendmsg+0x1394/0x1fa0 [ 283.184508] __sys_sendto+0x32b/0x390 [ 283.184508] __x64_sys_sendto+0xe4/0x1f0 [ 283.184508] do_syscall_64+0xe2/0xf80 [ 283.184508] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 283.184508] [ 283.184508] Freed by task 709: [ 283.184508] kasan_save_stack+0x30/0x50 [ 283.184508] kasan_save_track+0x14/0x30 [ 283.184508] kasan_save_free_info+0x3b/0x70 [ 283.184508] __kasan_slab_free+0x47/0x70 [ 283.184508] kfree+0x147/0x3b0 [ 283.184508] nf_queue+0x865/0x1700 [ 283.184508] nf_hook_slow+0x13f/0x1e0 [ 283.184508] ip_forward+0x1637/0x2010 [ 283.184508] ip_rcv+0x2e8/0x360 [ 283.184508] __netif_receive_skb_one_core+0x14f/0x1b0 [ 283.184508] process_backlog+0x1ea/0x5e0 [ 283.184508] __napi_poll+0x9d/0x4a0 [ 283.184508] net_rx_action+0x980/0xfb0 [ 283.184508] handle_softirqs+0x18e/0x520 [ 283.184508] do_softirq+0x42/0x60 [ 283.184508] __local_bh_enable_ip+0x62/0x70 [ 283.184508] __dev_queue_xmit+0x903/0x3390 [ 283.184508] ip_finish_output2+0x55d/0x1cb0 [ 283.184508] ip_do_fragment+0x13fb/0x1f80 [ 283.184508] __ip_finish_output+0x536/0xb10 [ 283.184508] ip_output+0x17a/0x2f0 [ 283.184508] ip_send_skb+0x113/0x150 [ 283.184508] udp_send_skb+0x7c9/0xff0 [ 283.184508] udp_sendmsg+0x1394/0x1fa0 [ 283.184508] __sys_sendto+0x32b/0x390 [ 283.184508] __x64_sys_sendto+0xe4/0x1f0 [ 283.184508] do_syscall_64+0xe2/0xf80 [ 283.184508] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 283.184508] [ 283.184508] The buggy address belongs to the object at ffff88800f151b00 [ 283.184508] which belongs to the cache kmalloc-128 of size 
128 [ 283.184508] The buggy address is located 16 bytes inside of [ 283.184508] freed 128-byte region [ffff88800f151b00, ffff88800f151b80) [ 283.184508] [ 283.184508] The buggy address belongs to the physical page: [ 283.184508] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff88800f151300 pfn:0xf150 [ 283.184508] head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 283.184508] flags: 0x80000000000240(workingset|head|node=0|zone=1) [ 283.184508] page_type: f5(slab) [ 283.184508] raw: 0080000000000240 ffff888001048a00 ffffea00001b3d10 ffffea00000e7190 [ 283.184508] raw: ffff88800f151300 0000000000200005 00000000f5000000 0000000000000000 [ 283.184508] head: 0080000000000240 ffff888001048a00 ffffea00001b3d10 ffffea00000e7190 [ 283.184508] head: ffff88800f151300 0000000000200005 00000000f5000000 0000000000000000 [ 283.184508] head: 0080000000000001 ffffea00003c5401 00000000ffffffff 00000000ffffffff [ 283.184508] head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 [ 283.184508] page dumped because: kasan: bad access detected [ 283.184508] [ 283.184508] Memory state around the buggy address: [ 283.184508] ffff88800f151a00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 283.184508] ffff88800f151a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 283.184508] >ffff88800f151b00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 283.184508] ^ [ 283.184508] ffff88800f151b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 283.184508] ffff88800f151c00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 283.184508] ================================================================== ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-06 12:54 ` Fernando Fernandez Mancera @ 2026-04-06 17:10 ` Florian Westphal 2026-04-06 20:04 ` Fernando Fernandez Mancera 0 siblings, 1 reply; 12+ messages in thread From: Florian Westphal @ 2026-04-06 17:10 UTC (permalink / raw) To: Fernando Fernandez Mancera; +Cc: Scott Mitchell, netfilter-devel Fernando Fernandez Mancera <fmancera@suse.de> wrote: > On 4/4/26 11:40 AM, Florian Westphal wrote: > > I had a go at adding a stress test but its not > > triggering for me even if i run it for 10m. > > Bingo! I modified the test you attached a bit to include your > suggestions and I got the splat on virtme-ng when running the test. See > the whole splat attached below. > I am going to test your proposed patch and check out the results. In > addition I will do some cleanup and send this as a formal nf-next patch. Thanks Fernando, this helps a lot. Given the revert isn't clean and the per-queue ht fix isn't larger than the (modified...) revert I am still considering to apply the fix to nf instead of revert + nf-next defer. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-06 17:10 ` Florian Westphal @ 2026-04-06 20:04 ` Fernando Fernandez Mancera 0 siblings, 0 replies; 12+ messages in thread From: Fernando Fernandez Mancera @ 2026-04-06 20:04 UTC (permalink / raw) To: Florian Westphal; +Cc: Scott Mitchell, netfilter-devel On 4/6/26 7:10 PM, Florian Westphal wrote: > Fernando Fernandez Mancera <fmancera@suse.de> wrote: >> On 4/4/26 11:40 AM, Florian Westphal wrote: >>> I had a go at adding a stress test but its not >>> triggering for me even if i run it for 10m. >> >> Bingo! I modified the test you attached a bit to include your >> suggestions and I got the splat on virtme-ng when running the test. See >> the whole splat attached below. >> I am going to test your proposed patch and check out the results. In >> addition I will do some cleanup and send this as a formal nf-next patch. > > Thanks Fernando, this helps a lot. > No worries! > Given the revert isn't clean and the per-queue ht fix isn't larger > than the (modified...) revert I am still considering to apply the fix > to nf instead of revert + nf-next defer. Then I will send it to nf tree instead. I don't have a strong opinion. Thanks! Fernando. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 13:45 ` Florian Westphal 2026-04-03 15:55 ` Scott Mitchell 2026-04-03 23:57 ` Fernando Fernandez Mancera @ 2026-04-06 14:01 ` Fernando Fernandez Mancera 2 siblings, 0 replies; 12+ messages in thread From: Fernando Fernandez Mancera @ 2026-04-06 14:01 UTC (permalink / raw) To: Florian Westphal, Scott Mitchell; +Cc: netfilter-devel On 4/3/26 3:45 PM, Florian Westphal wrote: > Florian Westphal <fw@strlen.de> wrote: >> A probably better fix is to make the rhashtable perqueue, which is >> much more intrusive at this late stage. > > Tentative patch to do this, still misses selftest extensions: > This looks good from my side. I cannot reproduce the splat when this patch is applied. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 12:22 nfnetlink_queue crashes kernel Florian Westphal 2026-04-03 13:45 ` Florian Westphal @ 2026-04-03 13:59 ` Florian Westphal 2026-04-03 14:02 ` Pablo Neira Ayuso 2 siblings, 0 replies; 12+ messages in thread From: Florian Westphal @ 2026-04-03 13:59 UTC (permalink / raw) To: Scott Mitchell; +Cc: netfilter-devel Florian Westphal <fw@strlen.de> wrote: > 2) a4400a5b343d ("netfilter: nfnetlink_queue: nfqnl_instance GFP_ATOMIC -> GFP_KERNEL_ACCOUNT allocation") should have updated the spinlock to use the _bh variant, if the queue exists we risk deadlock via softirq recursion. Nope, this one is fine, the instance lock is only taken from process context, but 1) still stands. ^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: nfnetlink_queue crashes kernel 2026-04-03 12:22 nfnetlink_queue crashes kernel Florian Westphal 2026-04-03 13:45 ` Florian Westphal 2026-04-03 13:59 ` Florian Westphal @ 2026-04-03 14:02 ` Pablo Neira Ayuso 2 siblings, 0 replies; 12+ messages in thread From: Pablo Neira Ayuso @ 2026-04-03 14:02 UTC (permalink / raw) To: Florian Westphal; +Cc: Scott Mitchell, netfilter-devel On Fri, Apr 03, 2026 at 02:22:01PM +0200, Florian Westphal wrote: > Hello, > > nfnetlink_queue is currently broken in at least two ways. > 1). kernel will crash under pressure because > struct nf_queue_entry is freed via kfree, but parallel > cpu can still observe this while walking the (global) rhashtable. > > 2) a4400a5b343d ("netfilter: nfnetlink_queue: nfqnl_instance GFP_ATOMIC -> GFP_KERNEL_ACCOUNT allocation") should have updated the spinlock to use the _bh variant, if the queue exists we risk deadlock via softirq recursion. > > Minimal fix, that I am not a fan of: > > diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h > --- a/include/net/netfilter/nf_queue.h > +++ b/include/net/netfilter/nf_queue.h > @@ -24,6 +24,7 @@ struct nf_queue_entry { > bool nf_ct_is_unconfirmed; > u16 size; /* sizeof(entry) + saved route keys */ > u16 queue_num; > + struct rcu_head head; > > /* extra space to store route keys */ > }; > diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c > index 7f12e56e6e52..385d1fe704ae 100644 > --- a/net/netfilter/nf_queue.c > +++ b/net/netfilter/nf_queue.c > @@ -74,7 +74,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) > void nf_queue_entry_free(struct nf_queue_entry *entry) > { > nf_queue_entry_release_refs(entry); > - kfree(entry); > + kfree_rcu(entry, head); > } > EXPORT_SYMBOL_GPL(nf_queue_entry_free); > > diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c > index 47f7f62906e2..fc44ea4e5128 100644 > --- a/net/netfilter/nfnetlink_queue.c > +++ 
b/net/netfilter/nfnetlink_queue.c > @@ -190,7 +190,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) > spin_lock_init(&inst->lock); > INIT_LIST_HEAD(&inst->queue_list); > > - spin_lock(&q->instances_lock); > + spin_lock_bh(&q->instances_lock); > if (instance_lookup(q, queue_num)) { > err = -EEXIST; > goto out_unlock; > @@ -204,7 +204,7 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) > h = instance_hashfn(queue_num); > hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); > > - spin_unlock(&q->instances_lock); > + spin_unlock_bh(&q->instances_lock); > > return inst; > > > > > A probably better fix is to make the rhashtable perqueue, which is > much more intrusive at this late stage. > > I prefer to revert both changes and not accept a reworked hashtable > until we have an extension to nft_queue.sh selftest. +1, then take a bit more time to get this right. ^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-04-06 20:04 UTC | newest] Thread overview: 12+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-04-03 12:22 nfnetlink_queue crashes kernel Florian Westphal 2026-04-03 13:45 ` Florian Westphal 2026-04-03 15:55 ` Scott Mitchell 2026-04-03 19:14 ` Florian Westphal 2026-04-03 23:57 ` Fernando Fernandez Mancera 2026-04-04 9:40 ` Florian Westphal 2026-04-06 12:54 ` Fernando Fernandez Mancera 2026-04-06 17:10 ` Florian Westphal 2026-04-06 20:04 ` Fernando Fernandez Mancera 2026-04-06 14:01 ` Fernando Fernandez Mancera 2026-04-03 13:59 ` Florian Westphal 2026-04-03 14:02 ` Pablo Neira Ayuso
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox