* [PATCH v7] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
@ 2026-01-23 13:54 scott.k.mitch1
2026-01-23 14:58 ` Florian Westphal
0 siblings, 1 reply; 3+ messages in thread
From: scott.k.mitch1 @ 2026-01-23 13:54 UTC (permalink / raw)
To: netfilter-devel; +Cc: pablo, fw, Scott Mitchell
From: Scott Mitchell <scott.k.mitch1@gmail.com>
The current implementation uses a linear list to find queued packets by
ID when processing verdicts from userspace. With large queue depths and
out-of-order verdicting, this O(n) lookup becomes a significant
bottleneck, causing userspace verdict processing to dominate CPU time.
Replace the linear search with a hash table for O(1) average-case
packet lookup by ID. A global rhashtable spanning all network
namespaces attributes hash bucket memory to kernel but is subject to
fixed upper bound.
Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
---
Changes in v7:
- Use global rhashtable instead of per-queue hash.
- Split previous patch 1 (no longer necessary for rhashtable)
into independent patch series.
Changes in v6:
- Split into 2-patch series
- Patch 1: Refactor locking to allow GFP_KERNEL_ACCOUNT allocation in
instance_create() by dropping RCU lock after instance_lookup() and
peer_portid verification (Florian Westphal)
- Patch 2: Remove UAPI for hash size, automatic resize, attribute
memory to cgroup.
Changes in v5:
- Use GFP_ATOMIC with kvmalloc_array instead of GFP_KERNEL_ACCOUNT due to
rcu_read_lock held in nfqnl_recv_config. Add comment explaining that
GFP_KERNEL_ACCOUNT would require lock refactoring (Florian Westphal)
Changes in v4:
- Fix sleeping while atomic bug: allocate hash table before taking
spinlock in instance_create() (syzbot)
Changes in v3:
- Simplify hash function to use direct masking (id & mask) instead of
hash_32() for better cache locality with sequential IDs (Eric Dumazet)
Changes in v2:
- Use kvcalloc/kvfree with GFP_KERNEL_ACCOUNT to support larger hash
tables with vmalloc fallback (Florian Westphal)
- Remove incorrect comment about concurrent resizes - nfnetlink subsystem
mutex already serializes config operations (Florian Westphal)
- Fix style: remove unnecessary braces around single-line if (Florian Westphal)
include/net/netfilter/nf_queue.h | 3 +
net/netfilter/nfnetlink_queue.c | 129 ++++++++++++++++++++++++++-----
2 files changed, 113 insertions(+), 19 deletions(-)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 4aeffddb7586..e6803831d6af 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -6,11 +6,13 @@
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/netfilter.h>
+#include <linux/rhashtable-types.h>
#include <linux/skbuff.h>
/* Each queued (to userspace) skbuff has one of these. */
struct nf_queue_entry {
struct list_head list;
+ struct rhash_head hash_node;
struct sk_buff *skb;
unsigned int id;
unsigned int hook_index; /* index in hook_entries->hook[] */
@@ -20,6 +22,7 @@ struct nf_queue_entry {
#endif
struct nf_hook_state state;
u16 size; /* sizeof(entry) + saved route keys */
+ u16 queue_num;
/* extra space to store route keys */
};
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8b7b39d8a109..e7372955c5a8 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -30,6 +30,8 @@
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <linux/cgroup-defs.h>
+#include <linux/rhashtable.h>
+#include <linux/jhash.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -47,6 +49,8 @@
#endif
#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_HASH_MIN 1024
+#define NFQNL_HASH_MAX 1048576
/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
* includes the header length. Thus, the maximum packet length that we
@@ -56,6 +60,16 @@
*/
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
+/* Composite key for packet lookup: (net, queue_num, packet_id) */
+struct nfqnl_packet_key {
+ struct net *net;
+ u32 packet_id;
+ u16 queue_num;
+};
+
+/* Global rhashtable - one for entire system, all netns */
+static struct rhashtable nfqnl_packet_map __read_mostly;
+
struct nfqnl_instance {
struct hlist_node hlist; /* global list of queues */
struct rcu_head rcu;
@@ -100,6 +114,45 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num)
return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
}
+/* Extract composite key from nf_queue_entry for hashing */
+static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nf_queue_entry *entry = data;
+ struct nfqnl_packet_key key;
+
+ /* Zero entire struct including padding to ensure deterministic hashing */
+ memset(&key, 0, sizeof(key));
+ key.net = entry->state.net;
+ key.packet_id = entry->id;
+ key.queue_num = entry->queue_num;
+
+ /* jhash2 requires size to be a multiple of sizeof(u32) */
+ BUILD_BUG_ON(sizeof(key) % sizeof(u32) != 0);
+ return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed);
+}
+
+/* Compare stack-allocated key against entry */
+static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct nfqnl_packet_key *key = arg->key;
+ const struct nf_queue_entry *entry = obj;
+
+ return entry->state.net != key->net ||
+ entry->queue_num != key->queue_num ||
+ entry->id != key->packet_id;
+}
+
+static const struct rhashtable_params nfqnl_rhashtable_params = {
+ .head_offset = offsetof(struct nf_queue_entry, hash_node),
+ .key_len = sizeof(struct nfqnl_packet_key),
+ .obj_hashfn = nfqnl_packet_obj_hashfn,
+ .obj_cmpfn = nfqnl_packet_obj_cmpfn,
+ .automatic_shrinking = true,
+ .min_size = NFQNL_HASH_MIN,
+ .max_size = NFQNL_HASH_MAX,
+};
+
static struct nfqnl_instance *
instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
@@ -191,33 +244,49 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
spin_unlock(&q->instances_lock);
}
-static inline void
+static inline int
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
- list_add_tail(&entry->list, &queue->queue_list);
- queue->queue_total++;
+ int err;
+
+ entry->queue_num = queue->queue_num;
+
+ err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node,
+ nfqnl_rhashtable_params);
+ if (unlikely(err))
+ return err;
+
+ list_add_tail(&entry->list, &queue->queue_list);
+ queue->queue_total++;
+
+ return 0;
}
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
+ rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node,
+ nfqnl_rhashtable_params);
list_del(&entry->list);
queue->queue_total--;
}
static struct nf_queue_entry *
-find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id,
+ struct net *net)
{
- struct nf_queue_entry *entry = NULL, *i;
+ struct nfqnl_packet_key key;
+ struct nf_queue_entry *entry;
- spin_lock_bh(&queue->lock);
+ /* Zero entire struct including padding to ensure deterministic hashing */
+ memset(&key, 0, sizeof(key));
+ key.net = net;
+ key.packet_id = id;
+ key.queue_num = queue->queue_num;
- list_for_each_entry(i, &queue->queue_list, list) {
- if (i->id == id) {
- entry = i;
- break;
- }
- }
+ spin_lock_bh(&queue->lock);
+ entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key,
+ nfqnl_rhashtable_params);
if (entry)
__dequeue_entry(queue, entry);
@@ -407,8 +476,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_lock_bh(&queue->lock);
list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue->queue_total--;
+ __dequeue_entry(queue, entry);
nfqnl_reinject(entry, NF_DROP);
}
}
@@ -902,9 +970,25 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
entry->id = ++queue->id_sequence;
*packet_id_ptr = htonl(entry->id);
+ /* Insert into hash BEFORE unicast. If failure don't send to userspace. */
+ err = __enqueue_entry(queue, entry);
+ if (unlikely(err)) {
+ if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+ failopen = 1;
+ err = 0;
+ } else {
+ queue->queue_dropped++;
+ net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
+ }
+ goto err_out_free_nskb;
+ }
+
/* nfnetlink_unicast will either free the nskb or add it to a socket */
err = nfnetlink_unicast(nskb, net, queue->peer_portid);
if (err < 0) {
+ /* Unicast failed - remove entry we just inserted */
+ __dequeue_entry(queue, entry);
+
if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
failopen = 1;
err = 0;
@@ -914,8 +998,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
goto err_out_unlock;
}
- __enqueue_entry(queue, entry);
-
spin_unlock_bh(&queue->lock);
return 0;
@@ -1430,7 +1512,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
verdict = ntohl(vhdr->verdict);
- entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+ entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net);
if (entry == NULL)
return -ENOENT;
@@ -1781,10 +1863,16 @@ static int __init nfnetlink_queue_init(void)
{
int status;
+ status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params);
+ if (status < 0) {
+ pr_err("failed to init packet hash table\n");
+ return status;
+ }
+
status = register_pernet_subsys(&nfnl_queue_net_ops);
if (status < 0) {
pr_err("failed to register pernet ops\n");
- goto out;
+ goto cleanup_rhashtable;
}
netlink_register_notifier(&nfqnl_rtnl_notifier);
@@ -1809,7 +1897,8 @@ static int __init nfnetlink_queue_init(void)
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
-out:
+cleanup_rhashtable:
+ rhashtable_destroy(&nfqnl_packet_map);
return status;
}
@@ -1821,6 +1910,8 @@ static void __exit nfnetlink_queue_fini(void)
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
+ rhashtable_destroy(&nfqnl_packet_map);
+
rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
--
2.39.5 (Apple Git-154)
^ permalink raw reply related [flat|nested] 3+ messages in thread* Re: [PATCH v7] netfilter: nfnetlink_queue: optimize verdict lookup with hash table
2026-01-23 13:54 [PATCH v7] netfilter: nfnetlink_queue: optimize verdict lookup with hash table scott.k.mitch1
@ 2026-01-23 14:58 ` Florian Westphal
2026-01-23 17:36 ` Scott Mitchell
0 siblings, 1 reply; 3+ messages in thread
From: Florian Westphal @ 2026-01-23 14:58 UTC (permalink / raw)
To: scott.k.mitch1; +Cc: netfilter-devel, pablo
scott.k.mitch1@gmail.com <scott.k.mitch1@gmail.com> wrote:
LGTM, just a few minor nits.
> diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
> index 4aeffddb7586..e6803831d6af 100644
> --- a/include/net/netfilter/nf_queue.h
> +++ b/include/net/netfilter/nf_queue.h
> @@ -6,11 +6,13 @@
> #include <linux/ipv6.h>
> #include <linux/jhash.h>
> #include <linux/netfilter.h>
> +#include <linux/rhashtable-types.h>
> #include <linux/skbuff.h>
>
> /* Each queued (to userspace) skbuff has one of these. */
> struct nf_queue_entry {
> struct list_head list;
> + struct rhash_head hash_node;
> struct sk_buff *skb;
> unsigned int id;
> unsigned int hook_index; /* index in hook_entries->hook[] */
> @@ -20,6 +22,7 @@ struct nf_queue_entry {
> #endif
> struct nf_hook_state state;
> u16 size; /* sizeof(entry) + saved route keys */
> + u16 queue_num;
>
> /* extra space to store route keys */
> };
> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
> index 8b7b39d8a109..e7372955c5a8 100644
> --- a/net/netfilter/nfnetlink_queue.c
> +++ b/net/netfilter/nfnetlink_queue.c
> #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
>
> +/* Composite key for packet lookup: (net, queue_num, packet_id) */
> +struct nfqnl_packet_key {
> + struct net *net;
possible_net_t net;
so we don't have this member for CONFIG_NET_NS=n
> + u32 packet_id;
> + u16 queue_num;
> +};
> +
> +/* Global rhashtable - one for entire system, all netns */
> +static struct rhashtable nfqnl_packet_map __read_mostly;
> +
> struct nfqnl_instance {
> struct hlist_node hlist; /* global list of queues */
> struct rcu_head rcu;
> @@ -100,6 +114,45 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num)
> return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
> }
>
> +/* Extract composite key from nf_queue_entry for hashing */
> +static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> + const struct nf_queue_entry *entry = data;
> + struct nfqnl_packet_key key;
> +
> + /* Zero entire struct including padding to ensure deterministic hashing */
> + memset(&key, 0, sizeof(key));
> + key.net = entry->state.net;
write_pnet()
> + /* jhash2 requires size to be a multiple of sizeof(u32) */
> + BUILD_BUG_ON(sizeof(key) % sizeof(u32) != 0);
I wonder if this works on 32bit.
If not, alternative is to either add __align to the struct or
to resort to jhash_3words().
Up to you.
> +/* Compare stack-allocated key against entry */
> +static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg,
> + const void *obj)
> +{
> + const struct nfqnl_packet_key *key = arg->key;
> + const struct nf_queue_entry *entry = obj;
> +
> + return entry->state.net != key->net ||
net_eq(entry->state.net, read_pnet( ...
> +static inline int
> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
> {
Is the inline keyword needed here?
> + memset(&key, 0, sizeof(key));
> + key.net = net;
> + key.packet_id = id;
> + key.queue_num = queue->queue_num;
Maybe its worth to add a small helper that fills the structure.
> + /* Insert into hash BEFORE unicast. If failure don't send to userspace. */
> + err = __enqueue_entry(queue, entry);
> + if (unlikely(err)) {
> + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
> + failopen = 1;
> + err = 0;
> + } else {
> + queue->queue_dropped++;
> + net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
> + }
> + goto err_out_free_nskb;
This repeated conditional is not so nice, is there a way to avoid it?
E.g. new common helper or via goto.
> + status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params);
> + if (status < 0) {
> + pr_err("failed to init packet hash table\n");
Please remove this pr_err, kernel gets quite noisy when
it starts to run out of memory. The other pr_err()s init .init
are just historic artefacts.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2026-01-23 17:37 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-01-23 13:54 [PATCH v7] netfilter: nfnetlink_queue: optimize verdict lookup with hash table scott.k.mitch1
2026-01-23 14:58 ` Florian Westphal
2026-01-23 17:36 ` Scott Mitchell
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.